diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 2c870d1171658..8e33920e82a21 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -465,6 +465,54 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {MultiDomain ? X86::VPMOVZXDQZ128rm : 0, 2, 32, rebuildZExtCst}}; return FixupConstant(Fixups, 128, 1); } + case X86::VMOVAPDZ128rmk: + case X86::VMOVUPDZ128rmk: { + FixupEntry Fixups[] = { + {MultiDomain ? X86::VPMOVSXBQZ128rmk : 0, 2, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZ128rmk : 0, 2, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQZ128rmk : 0, 2, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZ128rmk : 0, 2, 16, rebuildZExtCst}, + {X86::VMOVSDZrmk, 1, 64, rebuildZeroUpperCst}, + {X86::VMOVDDUPZ128rmk, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXDQZ128rmk : 0, 2, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQZ128rmk : 0, 2, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 128, 3); + } + case X86::VMOVAPDZ128rmkz: + case X86::VMOVUPDZ128rmkz: { + FixupEntry Fixups[] = { + {MultiDomain ? X86::VPMOVSXBQZ128rmkz : 0, 2, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZ128rmkz : 0, 2, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQZ128rmkz : 0, 2, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZ128rmkz : 0, 2, 16, rebuildZExtCst}, + {X86::VMOVSDZrmkz, 1, 64, rebuildZeroUpperCst}, + {X86::VMOVDDUPZ128rmkz, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXDQZ128rmkz : 0, 2, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQZ128rmkz : 0, 2, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 128, 2); + } + case X86::VMOVAPSZ128rmk: + case X86::VMOVUPSZ128rmk: { + FixupEntry Fixups[] = { + {X86::VMOVSSZrmk, 1, 32, rebuildZeroUpperCst}, + {X86::VBROADCASTSSZ128rmk, 1, 32, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZ128rmk : 0, 4, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZ128rmk : 0, 4, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWDZ128rmk : 0, 4, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZ128rmk : 0, 4, 16, rebuildZExtCst}}; + return FixupConstant(Fixups, 128, 3); + } + case X86::VMOVAPSZ128rmkz: + case X86::VMOVUPSZ128rmkz: { + FixupEntry Fixups[] = { + {X86::VMOVSSZrmkz, 1, 32, rebuildZeroUpperCst}, + {X86::VBROADCASTSSZ128rmkz, 1, 32, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZ128rmkz : 0, 4, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZ128rmkz : 0, 4, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWDZ128rmkz : 0, 4, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZ128rmkz : 0, 4, 16, rebuildZExtCst}}; + return FixupConstant(Fixups, 128, 2); + } case X86::VMOVAPDZ256rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPDZ256rm: @@ -485,6 +533,52 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {MultiDomain ? X86::VPMOVZXDQZ256rm : 0, 4, 32, rebuildZExtCst}}; return FixupConstant(Fixups, 256, 1); } + case X86::VMOVAPDZ256rmk: + case X86::VMOVUPDZ256rmk: { + FixupEntry Fixups[] = { + {MultiDomain ? X86::VPMOVSXBQZ256rmk : 0, 4, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZ256rmk : 0, 4, 8, rebuildZExtCst}, + {X86::VBROADCASTSDZ256rmk, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWQZ256rmk : 0, 4, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZ256rmk : 0, 4, 16, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXDQZ256rmk : 0, 4, 32, rebuildSExtCst}, + {MultiDomain ? 
X86::VPMOVZXDQZ256rmk : 0, 4, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 256, 3); + } + case X86::VMOVAPDZ256rmkz: + case X86::VMOVUPDZ256rmkz: { + FixupEntry Fixups[] = { + {MultiDomain ? X86::VPMOVSXBQZ256rmkz : 0, 4, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZ256rmkz : 0, 4, 8, rebuildZExtCst}, + {X86::VBROADCASTSDZ256rmkz, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWQZ256rmkz : 0, 4, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZ256rmkz : 0, 4, 16, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXDQZ256rmkz : 0, 4, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQZ256rmkz : 0, 4, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 256, 2); + } + case X86::VMOVAPSZ256rmk: + case X86::VMOVUPSZ256rmk: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSSZ256rmk, 1, 32, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZ256rmk : 0, 8, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZ256rmk : 0, 8, 8, rebuildZExtCst}, + {X86::VBROADCASTF32X4Z256rmk, 1, 128, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWDZ256rmk : 0, 8, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZ256rmk : 0, 8, 16, rebuildZExtCst}}; + return FixupConstant(Fixups, 256, 3); + } + case X86::VMOVAPSZ256rmkz: + case X86::VMOVUPSZ256rmkz: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSSZ256rmkz, 1, 32, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZ256rmkz : 0, 8, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZ256rmkz : 0, 8, 8, rebuildZExtCst}, + {X86::VBROADCASTF32X4Z256rmkz, 1, 128, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXWDZ256rmkz : 0, 8, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZ256rmkz : 0, 8, 16, rebuildZExtCst}}; + return FixupConstant(Fixups, 256, 2); + } case X86::VMOVAPDZrm: case X86::VMOVAPSZrm: case X86::VMOVUPDZrm: @@ -505,6 +599,54 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {MultiDomain ? X86::VPMOVSXDQZrm : 0, 8, 32, rebuildSExtCst}, {MultiDomain ? X86::VPMOVZXDQZrm : 0, 8, 32, rebuildZExtCst}}; return FixupConstant(Fixups, 512, 1); + } + case X86::VMOVAPDZrmk: + case X86::VMOVUPDZrmk: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSDZrmk, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBQZrmk : 0, 8, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZrmk : 0, 8, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQZrmk : 0, 8, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZrmk : 0, 8, 16, rebuildZExtCst}, + {X86::VBROADCASTF64X4Zrmk, 1, 256, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXDQZrmk : 0, 8, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQZrmk : 0, 8, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 512, 3); + } + case X86::VMOVAPDZrmkz: + case X86::VMOVUPDZrmkz: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSDZrmkz, 1, 64, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBQZrmkz : 0, 8, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBQZrmkz : 0, 8, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWQZrmkz : 0, 8, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWQZrmkz : 0, 8, 16, rebuildZExtCst}, + {X86::VBROADCASTF64X4Zrmkz, 1, 256, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXDQZrmkz : 0, 8, 32, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXDQZrmkz : 0, 8, 32, rebuildZExtCst}}; + return FixupConstant(Fixups, 512, 2); + } + case X86::VMOVAPSZrmk: + case X86::VMOVUPSZrmk: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSSZrmk, 1, 32, rebuildSplatCst}, + {X86::VBROADCASTF32X4Zrmk, 1, 128, rebuildSplatCst}, + {MultiDomain ? 
X86::VPMOVSXBDZrmk : 0, 16, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZrmk : 0, 16, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWDZrmk : 0, 16, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZrmk : 0, 16, 16, rebuildZExtCst}}; + return FixupConstant(Fixups, 512, 3); + } + case X86::VMOVAPSZrmkz: + case X86::VMOVUPSZrmkz: { + FixupEntry Fixups[] = { + {X86::VBROADCASTSSZrmkz, 1, 32, rebuildSplatCst}, + {X86::VBROADCASTF32X4Zrmkz, 1, 128, rebuildSplatCst}, + {MultiDomain ? X86::VPMOVSXBDZrmkz : 0, 16, 8, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXBDZrmkz : 0, 16, 8, rebuildZExtCst}, + {MultiDomain ? X86::VPMOVSXWDZrmkz : 0, 16, 16, rebuildSExtCst}, + {MultiDomain ? X86::VPMOVZXWDZrmkz : 0, 16, 16, rebuildZExtCst}}; + return FixupConstant(Fixups, 512, 2); } /* Integer Loads */ case X86::MOVDQArm: @@ -601,6 +743,42 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}}; return FixupConstant(Fixups, 128, 1); } + case X86::VMOVDQA32Z128rmk: + case X86::VMOVDQU32Z128rmk: + return FixupConstant({{X86::VPBROADCASTDZ128rmk, 1, 32, rebuildSplatCst}, + {X86::VPMOVSXBDZ128rmk, 4, 8, rebuildSExtCst}, + {X86::VPMOVZXBDZ128rmk, 4, 8, rebuildZExtCst}, + {X86::VPMOVSXWDZ128rmk, 4, 16, rebuildSExtCst}, + {X86::VPMOVZXWDZ128rmk, 4, 16, rebuildZExtCst}}, + 128, 3); + case X86::VMOVDQA32Z128rmkz: + case X86::VMOVDQU32Z128rmkz: + return FixupConstant({{X86::VPBROADCASTDZ128rmkz, 1, 32, rebuildSplatCst}, + {X86::VPMOVSXBDZ128rmkz, 4, 8, rebuildSExtCst}, + {X86::VPMOVZXBDZ128rmkz, 4, 8, rebuildZExtCst}, + {X86::VPMOVSXWDZ128rmkz, 4, 16, rebuildSExtCst}, + {X86::VPMOVZXWDZ128rmkz, 4, 16, rebuildZExtCst}}, + 128, 2); + case X86::VMOVDQA64Z128rmk: + case X86::VMOVDQU64Z128rmk: + return FixupConstant({{X86::VPMOVSXBQZ128rmk, 2, 8, rebuildSExtCst}, + {X86::VPMOVZXBQZ128rmk, 2, 8, rebuildZExtCst}, + {X86::VPMOVSXWQZ128rmk, 2, 16, rebuildSExtCst}, + {X86::VPMOVZXWQZ128rmk, 2, 16, rebuildZExtCst}, + {X86::VPBROADCASTQZ128rmk, 1, 64, rebuildSplatCst}, + {X86::VPMOVSXDQZ128rmk, 2, 32, rebuildSExtCst}, + {X86::VPMOVZXDQZ128rmk, 2, 32, rebuildZExtCst}}, + 128, 3); + case X86::VMOVDQA64Z128rmkz: + case X86::VMOVDQU64Z128rmkz: + return FixupConstant({{X86::VPMOVSXBQZ128rmkz, 2, 8, rebuildSExtCst}, + {X86::VPMOVZXBQZ128rmkz, 2, 8, rebuildZExtCst}, + {X86::VPMOVSXWQZ128rmkz, 2, 16, rebuildSExtCst}, + {X86::VPMOVZXWQZ128rmkz, 2, 16, rebuildZExtCst}, + {X86::VPBROADCASTQZ128rmkz, 1, 64, rebuildSplatCst}, + {X86::VPMOVSXDQZ128rmkz, 2, 32, rebuildSExtCst}, + {X86::VPMOVZXDQZ128rmkz, 2, 32, rebuildZExtCst}}, + 128, 2); case X86::VMOVDQA32Z256rm: case X86::VMOVDQA64Z256rm: case X86::VMOVDQU32Z256rm: @@ -625,6 +803,46 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}}; return FixupConstant(Fixups, 256, 1); } + case X86::VMOVDQA32Z256rmk: + case X86::VMOVDQU32Z256rmk: + return FixupConstant( + {{X86::VPBROADCASTDZ256rmk, 1, 32, rebuildSplatCst}, + {X86::VPMOVSXBDZ256rmk, 8, 8, rebuildSExtCst}, + {X86::VPMOVZXBDZ256rmk, 8, 8, rebuildZExtCst}, + {X86::VBROADCASTI32X4Z256rmk, 1, 128, rebuildSplatCst}, + {X86::VPMOVSXWDZ256rmk, 8, 16, rebuildSExtCst}, + {X86::VPMOVZXWDZ256rmk, 8, 16, rebuildZExtCst}}, + 256, 3); + case X86::VMOVDQA32Z256rmkz: + case X86::VMOVDQU32Z256rmkz: + return FixupConstant( + {{X86::VPBROADCASTDZ256rmkz, 1, 32, rebuildSplatCst}, + {X86::VPMOVSXBDZ256rmkz, 8, 8, rebuildSExtCst}, + {X86::VPMOVZXBDZ256rmkz, 8, 8, rebuildZExtCst}, + 
{X86::VBROADCASTI32X4Z256rmkz, 1, 128, rebuildSplatCst}, + {X86::VPMOVSXWDZ256rmkz, 8, 16, rebuildSExtCst}, + {X86::VPMOVZXWDZ256rmkz, 8, 16, rebuildZExtCst}}, + 256, 2); + case X86::VMOVDQA64Z256rmk: + case X86::VMOVDQU64Z256rmk: + return FixupConstant({{X86::VPMOVSXBQZ256rmk, 4, 8, rebuildSExtCst}, + {X86::VPMOVZXBQZ256rmk, 4, 8, rebuildZExtCst}, + {X86::VPBROADCASTQZ256rmk, 1, 64, rebuildSplatCst}, + {X86::VPMOVSXWQZ256rmk, 4, 16, rebuildSExtCst}, + {X86::VPMOVZXWQZ256rmk, 4, 16, rebuildZExtCst}, + {X86::VPMOVSXDQZ256rmk, 4, 32, rebuildSExtCst}, + {X86::VPMOVZXDQZ256rmk, 4, 32, rebuildZExtCst}}, + 256, 3); + case X86::VMOVDQA64Z256rmkz: + case X86::VMOVDQU64Z256rmkz: + return FixupConstant({{X86::VPMOVSXBQZ256rmkz, 4, 8, rebuildSExtCst}, + {X86::VPMOVZXBQZ256rmkz, 4, 8, rebuildZExtCst}, + {X86::VPBROADCASTQZ256rmkz, 1, 64, rebuildSplatCst}, + {X86::VPMOVSXWQZ256rmkz, 4, 16, rebuildSExtCst}, + {X86::VPMOVZXWQZ256rmkz, 4, 16, rebuildZExtCst}, + {X86::VPMOVSXDQZ256rmkz, 4, 32, rebuildSExtCst}, + {X86::VPMOVZXDQZ256rmkz, 4, 32, rebuildZExtCst}}, + 256, 2); case X86::VMOVDQA32Zrm: case X86::VMOVDQA64Zrm: case X86::VMOVDQU32Zrm: @@ -650,6 +868,46 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, {X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}}; return FixupConstant(Fixups, 512, 1); } + case X86::VMOVDQA32Zrmk: + case X86::VMOVDQU32Zrmk: + return FixupConstant({{X86::VPBROADCASTDZrmk, 1, 32, rebuildSplatCst}, + {X86::VBROADCASTI32X4Zrmk, 1, 128, rebuildSplatCst}, + {X86::VPMOVSXBDZrmk, 16, 8, rebuildSExtCst}, + {X86::VPMOVZXBDZrmk, 16, 8, rebuildZExtCst}, + {X86::VPMOVSXWDZrmk, 16, 16, rebuildSExtCst}, + {X86::VPMOVZXWDZrmk, 16, 16, rebuildZExtCst}}, + 512, 3); + case X86::VMOVDQA32Zrmkz: + case X86::VMOVDQU32Zrmkz: + return FixupConstant({{X86::VPBROADCASTDZrmkz, 1, 32, rebuildSplatCst}, + {X86::VBROADCASTI32X4Zrmkz, 1, 128, rebuildSplatCst}, + {X86::VPMOVSXBDZrmkz, 16, 8, rebuildSExtCst}, + {X86::VPMOVZXBDZrmkz, 16, 8, rebuildZExtCst}, + {X86::VPMOVSXWDZrmkz, 16, 16, rebuildSExtCst}, + {X86::VPMOVZXWDZrmkz, 16, 16, rebuildZExtCst}}, + 512, 2); + case X86::VMOVDQA64Zrmk: + case X86::VMOVDQU64Zrmk: + return FixupConstant({{X86::VPBROADCASTQZrmk, 1, 64, rebuildSplatCst}, + {X86::VPMOVSXBQZrmk, 8, 8, rebuildSExtCst}, + {X86::VPMOVZXBQZrmk, 8, 8, rebuildZExtCst}, + {X86::VPMOVSXWQZrmk, 8, 16, rebuildSExtCst}, + {X86::VPMOVZXWQZrmk, 8, 16, rebuildZExtCst}, + {X86::VBROADCASTI64X4Zrmk, 1, 256, rebuildSplatCst}, + {X86::VPMOVSXDQZrmk, 8, 32, rebuildSExtCst}, + {X86::VPMOVZXDQZrmk, 8, 32, rebuildZExtCst}}, + 512, 3); + case X86::VMOVDQA64Zrmkz: + case X86::VMOVDQU64Zrmkz: + return FixupConstant({{X86::VPBROADCASTQZrmkz, 1, 64, rebuildSplatCst}, + {X86::VPMOVSXBQZrmkz, 8, 8, rebuildSExtCst}, + {X86::VPMOVZXBQZrmkz, 8, 8, rebuildZExtCst}, + {X86::VPMOVSXWQZrmkz, 8, 16, rebuildSExtCst}, + {X86::VPMOVZXWQZrmkz, 8, 16, rebuildZExtCst}, + {X86::VBROADCASTI64X4Zrmkz, 1, 256, rebuildSplatCst}, + {X86::VPMOVSXDQZrmkz, 8, 32, rebuildSExtCst}, + {X86::VPMOVZXDQZrmkz, 8, 32, rebuildZExtCst}}, + 512, 2); } auto ConvertToBroadcast = [&](unsigned OpSrc, int BW) { @@ -670,7 +928,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, // Attempt to find a AVX512 mapping from a full width memory-fold instruction // to a broadcast-fold instruction variant. 
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) - return ConvertToBroadcast(Opc, 32) || ConvertToBroadcast(Opc, 64); + return ConvertToBroadcast(Opc, 16) || ConvertToBroadcast(Opc, 32) || + ConvertToBroadcast(Opc, 64); // Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic // conversion to see if we can convert to a broadcasted (integer) logic op. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1bf33b5ed43d8..b548f82f69c09 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7618,6 +7618,21 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); + // When optimizing for size, generate up to 5 extra bytes for a broadcast + // instruction to save 8 or more bytes of constant pool data. + // TODO: If multiple splats are generated to load the same constant, + // it may be detrimental to overall size. There needs to be a way to detect + // that condition to know if this is truly a size win. + bool OptForSize = DAG.shouldOptForSize(); + + // On AVX512VL targets we're better off keeping the full width constant load + // and letting X86FixupVectorConstantsPass handle conversion to + // broadcast/broadcast-fold. + // AVX512 targets without AVX512VL can do this only for 512-bit vectors. + if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) && + BVOp->isConstant() && !OptForSize) + return SDValue(); + // See if the build vector is a repeating sequence of scalars (inc. splat). SDValue Ld; BitVector UndefElements; @@ -7743,12 +7758,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); - // When optimizing for size, generate up to 5 extra bytes for a broadcast - // instruction to save 8 or more bytes of constant pool data. - // TODO: If multiple splats are generated to load the same constant, - // it may be detrimental to overall size. There needs to be a way to detect - // that condition to know if this is truly a size win. - bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. 
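
A rough, illustrative sketch (not part of the patch): with the lowerBuildVectorAsBroadcast change above, a constant splat such as the one below is no longer turned into an early broadcast on AVX512VL targets (assuming something like `llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl`). The full-width constant load is kept so that X86FixupVectorConstantsPass, including the new masked rmk/rmkz fixup entries above, can later rewrite it as a compact load such as vpmovsxbq or fold it as a broadcast memory operand, as the test updates below show.

; Hypothetical example, modelled on bcast_unfold_add_v4i64 in
; avx512-broadcast-unfold.ll below; the function name is illustrative only.
define <4 x i64> @splat_add_example(<4 x i64> %x) {
  ; The splat constant <2,2,2,2> used to be materialized with
  ; vpbroadcastq {{.*#+}} ymm = [2,2,2,2]; after this patch it may instead be
  ; emitted as vpmovsxbq {{.*#+}} ymm = [2,2,2,2], or folded as a {1to4}
  ; broadcast memory operand, depending on how the constant is used.
  %r = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2>
  ret <4 x i64> %r
}
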
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 1fada58f05ba9..08888f6f909b2 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1478,10 +1478,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1493,10 +1491,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -3235,10 +3231,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2)) -; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3252,10 +3246,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2)) -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) @@ -3762,8 
+3754,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4015,8 +4006,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 61e122b1aba36..756ab8fa4ae74 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1205,10 +1205,9 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem)) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -1216,10 +1215,9 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -2575,10 +2573,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm3 = 
mem[0,1,0,1] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2)) -; AVX512F-NEXT: vpaddb (%rsi), %ymm3, %ymm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -2590,10 +2586,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2)) -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm3, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll index c5243a5c18a2d..b8175abf59fb7 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1524,8 +1524,8 @@ define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte @@ -1581,8 +1581,8 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,4,4,4] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,4] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 20550fc4eb9fa..00bbdea278074 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -303,7 +303,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; ; AVX512VL-LABEL: imulq128_bcast: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] +; AVX512VL-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086] ; AVX512VL-NEXT: vpmuludq %xmm1, 
%xmm0, %xmm2 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll index ba2cacc087b36..0452e74d5a03a 100644 --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -135,7 +135,7 @@ define void @bcast_unfold_add_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB4_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -167,7 +167,7 @@ define void @bcast_unfold_add_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -521,7 +521,7 @@ define void @bcast_unfold_or_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB16_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -553,8 +553,7 @@ define void @bcast_unfold_or_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3,3] -; CHECK-NEXT: # xmm0 = mem[0,0] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2474,7 +2473,7 @@ define void @bcast_unfold_smin_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB75_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2506,7 +2505,7 @@ define void @bcast_unfold_smin_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB76_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2670,7 +2669,7 @@ define void @bcast_unfold_smax_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smax_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB81_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2702,7 +2701,7 @@ define void @bcast_unfold_smax_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smax_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB82_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2866,7 +2865,7 @@ define void @bcast_unfold_umin_v2i64(ptr %arg) 
{ ; CHECK-LABEL: bcast_unfold_umin_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB87_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2898,7 +2897,7 @@ define void @bcast_unfold_umin_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umin_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB88_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3062,7 +3061,7 @@ define void @bcast_unfold_umax_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB93_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3094,7 +3093,7 @@ define void @bcast_unfold_umax_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB94_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -3264,13 +3263,13 @@ define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB99_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 ; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 {%k1} = [3,3] ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB99_1 @@ -3298,13 +3297,13 @@ define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB100_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 ; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 {%k1} = [3,3,3,3] ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB100_1 @@ -3472,13 +3471,13 @@ define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB105_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 ; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 {%k1} = [3,3] ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: 
jne .LBB105_1 @@ -3506,13 +3505,13 @@ define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB106_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 {%k1} = [3,3,3,3] ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB106_1 @@ -3683,13 +3682,13 @@ define void @bcast_unfold_pcmp_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmp_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB111_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 {%k1} = [3,3] ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF @@ -3718,13 +3717,13 @@ define void @bcast_unfold_pcmp_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmp_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB112_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 {%k1} = [3,3,3,3] ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF @@ -3897,13 +3896,13 @@ define void @bcast_unfold_pcmpu_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpu_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB117_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 {%k1} = [3,3] ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF @@ -3932,13 +3931,13 @@ define void @bcast_unfold_pcmpu_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpu_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB118_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 {%k1} = [3,3,3,3] ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF @@ -4323,13 +4322,13 @@ define void @bcast_unfold_ptestm_v4i64(ptr %arg) { ; 
CHECK-LABEL: bcast_unfold_ptestm_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB129_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 ; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 {%k1} = [3,3,3,3] ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB129_1 @@ -4359,13 +4358,13 @@ define void @bcast_unfold_ptestnm_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_ptestnm_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB130_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 ; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 {%k1} = [3,3,3,3] ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB130_1 diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll index 96e20d0b8f967..a1f5bcf2eeabc 100644 --- a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll @@ -15,7 +15,7 @@ define void @f_fu(ptr %ret, ptr %aa, float %b) { ; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2 ; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 {%k1} = [u,3,u,5,u,7,u,9,u,11,u,13,u,15,u,17] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 {%k1} = [0,3,0,5,0,7,0,9,0,11,0,13,0,15,0,17] ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll index b264f5fc34688..fe9823f2b1424 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -297,8 +297,7 @@ entry: define half @fneg(half %x) { ; CHECK-LABEL: fneg: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = fneg half %x ret half %a @@ -307,8 +306,7 @@ define half @fneg(half %x) { define half @fneg_idiom(half %x) { ; CHECK-LABEL: fneg_idiom: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = fsub half -0.0, %x ret half %a @@ -317,8 +315,7 @@ define half @fneg_idiom(half %x) { define half @fabs(half %x) { ; CHECK-LABEL: fabs: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = call half @llvm.fabs.f16(half %x) ret half %a @@ -328,8 +325,7 @@ declare half @llvm.fabs.f16(half) define half @fcopysign(half %x, half %y) { 
; CHECK-LABEL: fcopysign: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm2 & (xmm0 ^ xmm1)) +; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1)) ; CHECK-NEXT: retq %a = call half @llvm.copysign.f16(half %x, half %y) ret half %a @@ -339,10 +335,9 @@ declare half @llvm.copysign.f16(half, half) define half @fround(half %x) { ; CHECK-LABEL: fround: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | (xmm0 & xmm1) -; CHECK-NEXT: vaddsh %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; CHECK-NEXT: vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem) +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = call half @llvm.round.f16(half %x) @@ -353,8 +348,7 @@ declare half @llvm.round.f16(half) define <8 x half> @fnegv8f16(<8 x half> %x) { ; CHECK-LABEL: fnegv8f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = fneg <8 x half> %x ret <8 x half> %a @@ -363,8 +357,7 @@ define <8 x half> @fnegv8f16(<8 x half> %x) { define <8 x half> @fneg_idiomv8f16(<8 x half> %x) { ; CHECK-LABEL: fneg_idiomv8f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = fsub <8 x half> , %x ret <8 x half> %a @@ -373,8 +366,7 @@ define <8 x half> @fneg_idiomv8f16(<8 x half> %x) { define <8 x half> @fabsv8f16(<8 x half> %x) { ; CHECK-LABEL: fabsv8f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = call <8 x half> @llvm.fabs.v8f16(<8 x half> %x) ret <8 x half> %a @@ -394,10 +386,9 @@ declare <8 x half> @llvm.copysign.v8f16(<8 x half>, <8 x half>) define <8 x half> @roundv8f16(<8 x half> %x) { ; CHECK-LABEL: roundv8f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | (xmm0 & xmm1) -; CHECK-NEXT: vaddph %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; CHECK-NEXT: vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem) +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vrndscaleph $11, %xmm0, %xmm0 ; CHECK-NEXT: retq %a = call <8 x half> @llvm.round.v8f16(<8 x half> %x) @@ -408,8 +399,7 @@ declare <8 x half> @llvm.round.v8f16(<8 x half>) define <16 x half> @fnegv16f16(<16 x half> %x) { ; CHECK-LABEL: fnegv16f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: 
vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-NEXT: retq %a = fneg <16 x half> %x ret <16 x half> %a @@ -418,8 +408,7 @@ define <16 x half> @fnegv16f16(<16 x half> %x) { define <16 x half> @fneg_idiomv16f16(<16 x half> %x) { ; CHECK-LABEL: fneg_idiomv16f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-NEXT: retq %a = fsub <16 x half> , %x ret <16 x half> %a @@ -428,8 +417,7 @@ define <16 x half> @fneg_idiomv16f16(<16 x half> %x) { define <16 x half> @fabsv16f16(<16 x half> %x) { ; CHECK-LABEL: fabsv16f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-NEXT: retq %a = call <16 x half> @llvm.fabs.v16f16(<16 x half> %x) ret <16 x half> %a @@ -449,10 +437,9 @@ declare <16 x half> @llvm.copysign.v16f16(<16 x half>, <16 x half>) define <16 x half> @roundv16f16(<16 x half> %x) { ; CHECK-LABEL: roundv16f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm1) -; CHECK-NEXT: vaddph %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; CHECK-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vrndscaleph $11, %ymm0, %ymm0 ; CHECK-NEXT: retq %a = call <16 x half> @llvm.round.v16f16(<16 x half> %x) @@ -463,8 +450,7 @@ declare <16 x half> @llvm.round.v16f16(<16 x half>) define <32 x half> @fnegv32f16(<32 x half> %x) { ; CHECK-LABEL: fnegv32f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; CHECK-NEXT: retq %a = fneg <32 x half> %x ret <32 x half> %a @@ -473,8 +459,7 @@ define <32 x half> @fnegv32f16(<32 x half> %x) { define <32 x half> @fneg_idiomv32f16(<32 x half> %x) { ; CHECK-LABEL: fneg_idiomv32f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = 
[-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; CHECK-NEXT: retq %a = fsub <32 x half> , %x ret <32 x half> %a @@ -483,8 +468,7 @@ define <32 x half> @fneg_idiomv32f16(<32 x half> %x) { define <32 x half> @fabsv32f16(<32 x half> %x) { ; CHECK-LABEL: fabsv32f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; CHECK-NEXT: retq %a = call <32 x half> @llvm.fabs.v32f16(<32 x half> %x) ret <32 x half> %a @@ -504,10 +488,9 @@ declare <32 x half> @llvm.copysign.v32f16(<32 x half>, <32 x half>) define <32 x half> @roundv32f16(<32 x half> %x) { ; CHECK-LABEL: roundv32f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; CHECK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & zmm1) -; CHECK-NEXT: vaddph %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vrndscaleph $11, %zmm0, %zmm0 ; CHECK-NEXT: retq %a = call <32 x half> @llvm.round.v32f16(<32 x half> %x) diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll index 7b142ea170c22..6558591d7a17d 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll @@ -69,8 +69,7 @@ entry: define dso_local <32 x half> @test5(<32 x half> noundef %a, <32 x half> noundef %b) local_unnamed_addr #0 { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxorq %zmm2, %zmm1, %zmm2 +; CHECK-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 ; CHECK-NEXT: vfmulcph %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll index f4c20b3b9b425..0051173103e1a 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll @@ -211,8 +211,7 @@ define half @movmsk(half %x) { define half @bitcast_fabs(half %x) { ; CHECK-LABEL: bitcast_fabs: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast half %x to i16 %and = and i16 %bc1, 32767 @@ -223,8 +222,7 @@ define half @bitcast_fabs(half %x) { define half @bitcast_fneg(half %x) { ; CHECK-LABEL: bitcast_fneg: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast half %x to i16 %xor = xor i16 %bc1, 32768 @@ -235,8 +233,7 @@ define half @bitcast_fneg(half %x) { define <8 x half> @bitcast_fabs_vec(<8 x half> %x) { ; CHECK-LABEL: bitcast_fabs_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast <8 x half> %x to <8 x i16> %and = and <8 x i16> %bc1, @@ -247,8 +244,7 @@ define <8 x half> @bitcast_fabs_vec(<8 x half> %x) { define <8 x half> @bitcast_fneg_vec(<8 x half> %x) { ; CHECK-LABEL: bitcast_fneg_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast <8 x half> %x to <8 x i16> %xor = xor <8 x i16> %bc1, @@ -285,8 +281,7 @@ define half @fsub_bitcast_fneg(half %x, half %y) { define half @nabs(half %a) { ; CHECK-LABEL: nabs: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %conv = bitcast half %a to i16 %and = or i16 %conv, -32768 @@ -297,8 +292,7 @@ define half @nabs(half %a) { define <8 x half> @nabsv8f16(<8 x half> %a) { ; CHECK-LABEL: nabsv8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: retq %conv = bitcast <8 x half> %a to <8 x i16> %and = or <8 x i16> %conv, @@ -357,7 +351,7 @@ define <8 x half> @fsub_bitcast_fneg_vec_undef_elts(<8 x half> %x, <8 x half> %y define <8 x half> @fadd_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) { ; CHECK-LABEL: fadd_bitcast_fneg_vec_width: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 +; CHECK-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast <8 x half> %y to <2 x i64> @@ -370,7 +364,7 @@ define <8 x half> @fadd_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) { define <8 x half> @fsub_bitcast_fneg_vec_width(<8 x half> %x, <8 
x half> %y) { ; CHECK-LABEL: fsub_bitcast_fneg_vec_width: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 +; CHECK-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast <8 x half> %y to <2 x i64> diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c2d648..b1b4e95c77741 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -60,18 +60,11 @@ define i1 @trunc_v2i64_cmp(<2 x i64> %a0) nounwind { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX12-LABEL: trunc_v2i64_cmp: -; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX12-NEXT: sete %al -; AVX12-NEXT: retq -; -; AVX512-LABEL: trunc_v2i64_cmp: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: retq +; AVX-LABEL: trunc_v2i64_cmp: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %1 = trunc <2 x i64> %a0 to <2 x i1> %2 = bitcast <2 x i1> %1 to i2 %3 = icmp eq i2 %2, 0 @@ -121,18 +114,11 @@ define i1 @trunc_v4i32_cmp(<4 x i32> %a0) nounwind { ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX12-LABEL: trunc_v4i32_cmp: -; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX12-NEXT: setb %al -; AVX12-NEXT: retq -; -; AVX512-LABEL: trunc_v4i32_cmp: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: retq +; AVX-LABEL: trunc_v4i32_cmp: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setb %al +; AVX-NEXT: retq %1 = trunc <4 x i32> %a0 to <4 x i1> %2 = bitcast <4 x i1> %1 to i4 %3 = icmp eq i4 %2, -1 @@ -195,18 +181,11 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX12-LABEL: trunc_v8i16_cmp: -; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX12-NEXT: setne %al -; AVX12-NEXT: retq -; -; AVX512-LABEL: trunc_v8i16_cmp: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [281479271743489,281479271743489] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; AVX-LABEL: trunc_v8i16_cmp: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setne %al +; AVX-NEXT: retq %1 = trunc <8 x i16> %a0 to <8 x i1> %2 = bitcast <8 x i1> %1 to i8 %3 = icmp ne i8 %2, 0 @@ -264,18 +243,11 @@ define i1 @trunc_v16i8_cmp(<16 x i8> %a0) nounwind { ; SSE41-NEXT: setae %al ; SSE41-NEXT: retq ; -; AVX12-LABEL: trunc_v16i8_cmp: -; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX12-NEXT: setae %al -; AVX12-NEXT: retq -; -; AVX512-LABEL: trunc_v16i8_cmp: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: retq +; AVX-LABEL: trunc_v16i8_cmp: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setae %al +; AVX-NEXT: retq %1 = trunc <16 x i8> %a0 to <16 x i1> %2 = bitcast <16 x i1> %1 to i16 %3 = icmp ne i16 %2, -1 @@ -350,8 +322,7 @@ define i1 @trunc_v4i64_cmp(<4 x i64> %a0) nounwind { ; ; 
AVX512-LABEL: trunc_v4i64_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -426,8 +397,7 @@ define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind { ; ; AVX512-LABEL: trunc_v8i132_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX512-NEXT: setae %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -524,8 +494,7 @@ define i1 @trunc_v16i16_cmp(<16 x i16> %a0) nounwind { ; ; AVX512-LABEL: trunc_v16i16_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -616,8 +585,7 @@ define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind { ; ; AVX512-LABEL: trunc_v32i8_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX512-NEXT: setb %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index 2e237fb5b07b7..fd7bb24800b42 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -1451,8 +1451,7 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; ; AVX512-LABEL: f8xi64_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1] ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl @@ -1485,8 +1484,7 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; ; AVX512F-64-LABEL: f8xi64_i128: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1] -; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1] ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq @@ -1524,8 +1522,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; ; AVX512-LABEL: f8xi64_i256: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl @@ -1557,8 +1554,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; ; AVX512F-64-LABEL: f8xi64_i256: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3] -; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-and.ll 
b/llvm/test/CodeGen/X86/combine-and.ll index e5594dc9c5e3c..450d43febaf67 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -325,11 +325,22 @@ define <2 x i64> @and_or_v2i64(<2 x i64> %a0) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,8] ; SSE-NEXT: retq ; -; AVX-LABEL: and_or_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [8,8] -; AVX-NEXT: # xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: and_or_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [8,8] +; AVX1-NEXT: # xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: and_or_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [8,8] +; AVX2-NEXT: # xmm0 = mem[0,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: and_or_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,8] +; AVX512-NEXT: retq %1 = or <2 x i64> %a0, %2 = and <2 x i64> %1, ret <2 x i64> %2 diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll index 25c26d598881a..3e3070939f231 100644 --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -1160,7 +1160,7 @@ define void @constantfold_andn_mask() nounwind { ; AVX512VL-NEXT: pushq %rax ; AVX512VL-NEXT: callq use@PLT ; AVX512VL-NEXT: vmovdqu (%rax), %xmm1 -; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,248,31,248,31,248,31,248,31,248,31,248,31,248,31,248] ; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpavgb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll index 55b1cdeddb853..e5bc9f675fc3b 100644 --- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll +++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll @@ -633,8 +633,7 @@ define <4 x float> @test25(<4 x float> %a0) { ; ; AVX512-LABEL: test25: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3] ; AVX512-NEXT: retq %bc1 = bitcast <4 x float> %a0 to <4 x i32> %bc2 = bitcast <4 x float> to <4 x i32> @@ -807,7 +806,7 @@ define <2 x i64> @or_and_v2i64(<2 x i64> %a0) { ; ; AVX512-LABEL: or_and_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | mem) ; AVX512-NEXT: retq %1 = and <2 x i64> %a0, diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 70335f834291d..4f906e0ad87bb 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -277,8 +277,7 @@ define i32 @PR43159(ptr %a0) { ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 @@ -300,8 +299,7 @@ define i32 @PR43159(ptr %a0) { ; AVX512DQVL-NEXT: 
vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX512DQVL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512DQVL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 1ae1d61091362..64f1700faeed3 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -960,8 +960,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] ; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 @@ -972,9 +971,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: diff --git a/llvm/test/CodeGen/X86/divrem-by-select.ll b/llvm/test/CodeGen/X86/divrem-by-select.ll index f9582bb7343ba..7b0ac15b6a084 100644 --- a/llvm/test/CodeGen/X86/divrem-by-select.ll +++ b/llvm/test/CodeGen/X86/divrem-by-select.ll @@ -161,7 +161,7 @@ define <2 x i64> @udiv_indentity_zero(<2 x i1> %c, <2 x i64> %x) { ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k0 ; CHECK-X64-V4-NEXT: knotw %k0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k1} {z} = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} {z} = [1,1] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: xorl %edx, %edx @@ -205,8 +205,8 @@ define <2 x i64> @udiv_indentity_partial_zero(<2 x i1> %c, <2 x i64> %x) { ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; CHECK-X64-V4-NEXT: vmovdqa64 {{.*#+}} xmm0 {%k1} = [0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} = [0,5] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: xorl %edx, %edx @@ -250,8 +250,8 @@ define <2 x i64> @urem_identity_const(<2 x i1> %c, <2 x i64> %x) { ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 
-; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [11,11] -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k1} = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 = [11,11] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} = [1,1] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: xorl %edx, %edx @@ -295,8 +295,8 @@ define <2 x i64> @sdiv_identity_const(<2 x i1> %c, <2 x i64> %x) { ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; CHECK-X64-V4-NEXT: vmovdqa64 {{.*#+}} xmm0 {%k1} = [11,13] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} = [11,13] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: cqto @@ -340,8 +340,8 @@ define <2 x i64> @sdiv_identity_const_todo_better_nonzero(<2 x i1> %c, <2 x i64> ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; CHECK-X64-V4-NEXT: vmovdqa64 {{.*#+}} xmm0 {%k1} = [11,17] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} = [11,17] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: cqto @@ -385,8 +385,8 @@ define <2 x i64> @srem_identity_const(<2 x i1> %c, <2 x i64> %x) { ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [11,11] -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k1} = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 = [11,11] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} = [1,1] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: cqto @@ -435,8 +435,8 @@ define <2 x i64> @udivrem_identity_const(<2 x i1> %c, <2 x i64> %x) { ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k1} = [11,11] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} = [11,11] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: xorl %edx, %edx @@ -492,8 +492,8 @@ define <2 x i64> @sdivrem_identity_const(<2 x i1> %c, <2 x i64> %x) { ; CHECK-X64-V4: # %bb.0: ; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1 -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] -; CHECK-X64-V4-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k1} = [11,11] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] +; CHECK-X64-V4-NEXT: vpmovsxbq {{.*#+}} xmm0 {%k1} = [11,11] ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm0, %rcx ; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax ; CHECK-X64-V4-NEXT: cqto diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll index 95a7a10d50f59..e0d7d6249033c 100644 --- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll +++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll @@ -111,7 +111,7 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) { ; AVX512-NEXT: callq use.v4.i32@PLT ; AVX512-NEXT: vmovdqa (%rsp), 
%xmm0 # 16-byte Reload ; AVX512-NEXT: vpcmpltud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 -; AVX512-NEXT: vmovdqa32 {{.*#+}} xmm0 {%k1} {z} = [4294967295,4294967295,4294967295,4294967295] +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 {%k1} {z} = [4294967295,4294967295,4294967295,4294967295] ; AVX512-NEXT: addq $24, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 8 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll index 018b6c2d20f1e..4f290acb81d24 100644 --- a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll @@ -257,7 +257,7 @@ define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) ; AVX512-LABEL: vp_fabs_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, (%rdi) ; AVX512-NEXT: retq %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) store <4 x float> %res, ptr %out @@ -320,7 +320,7 @@ define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) ; AVX512-LABEL: vp_fneg_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, (%rdi) ; AVX512-NEXT: retq %res = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) store <4 x float> %res, ptr %out diff --git a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll index 81ab104cab283..8f33f93af6367 100644 --- a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll +++ b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll @@ -6,8 +6,7 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vcmpnltps %ymm1, %ymm0, %k1 -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] -; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 {%k1} {z} = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 67c9e7cc22236..c3f0b8cb7c064 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -935,7 +935,7 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; CHECK-FMA-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: vcvtqq2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll index e98fb8e374c0b..635b3b7d806e2 100644 --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -60,10 +60,9 @@ define half @round_f16(half %h) { ; ; AVX512FP16-LABEL: round_f16: ; AVX512FP16: ## %bb.0: ## %entry -; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512FP16-NEXT: vpbroadcastw 
{{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] -; AVX512FP16-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | (xmm0 & xmm1) -; AVX512FP16-NEXT: vaddsh %xmm2, %xmm0, %xmm0 +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; AVX512FP16-NEXT: vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem) +; AVX512FP16-NEXT: vaddsh %xmm1, %xmm0, %xmm0 ; AVX512FP16-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0 ; AVX512FP16-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index 967f26f70946a..dcf8c05d7d9f5 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -1647,7 +1647,7 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512VL-NEXT: vpor %ymm7, %ymm9, %ymm7 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; GFNIAVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,1,1,1] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11 diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll index 62466bfa98ec2..b440f515da387 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -154,8 +154,7 @@ define <4 x i64> @illegal_abs_to_eq_or_sext(<4 x i64> %x) { ; AVX512-LABEL: illegal_abs_to_eq_or_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpabsq %ymm0, %ymm0 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX512-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: illegal_abs_to_eq_or_sext: @@ -279,8 +278,7 @@ define <4 x i64> @illegal_abs_to_ne_and_sext(<4 x i64> %x) { ; AVX512-LABEL: illegal_abs_to_ne_and_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpabsq %ymm0, %ymm0 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX512-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: retq ; @@ -346,8 +344,7 @@ define <4 x i1> @legal_abs_eq_unchanged(<4 x i32> %x) { ; AVX512-LABEL: legal_abs_eq_unchanged: ; AVX512: # %bb.0: ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: legal_abs_eq_unchanged: @@ -380,8 +377,7 @@ define <4 x i32> @legal_abs_eq_unchanged_sext(<4 x i32> %x) { ; AVX512-LABEL: legal_abs_eq_unchanged_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: legal_abs_eq_unchanged_sext: @@ -415,8 +411,7 @@ define <4 x i1> @legal_abs_ne_unchangedd(<4 x i32> %x) { ; AVX512-LABEL: legal_abs_ne_unchangedd: ; AVX512: # %bb.0: ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: 
vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -456,8 +451,7 @@ define <4 x i32> @legal_abs_ne_unchangedd_sext(<4 x i32> %x) { ; AVX512-LABEL: legal_abs_ne_unchangedd_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -558,9 +552,8 @@ define <4 x i1> @eq_or_to_abs_vec4x64(<4 x i64> %x) { define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) { ; AVX512-LABEL: eq_or_to_abs_vec4x64_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX512-NEXT: vpabsq %ymm0, %ymm0 -; AVX512-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: eq_or_to_abs_vec4x64_sext: @@ -693,9 +686,8 @@ define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) { define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) { ; AVX512-LABEL: ne_and_to_abs_vec4x64_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX512-NEXT: vpabsq %ymm0, %ymm0 -; AVX512-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: retq ; @@ -768,9 +760,8 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) { define <4 x i1> @eq_or_to_abs_vec4x32(<4 x i32> %x) { ; AVX512-LABEL: eq_or_to_abs_vec4x32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: eq_or_to_abs_vec4x32: @@ -803,9 +794,8 @@ define <4 x i1> @eq_or_to_abs_vec4x32(<4 x i32> %x) { define <4 x i32> @eq_or_to_abs_vec4x32_sext(<4 x i32> %x) { ; AVX512-LABEL: eq_or_to_abs_vec4x32_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: eq_or_to_abs_vec4x32_sext: @@ -839,9 +829,8 @@ define <4 x i32> @eq_or_to_abs_vec4x32_sext(<4 x i32> %x) { define <4 x i1> @ne_and_to_abs_vec4x32(<4 x i32> %x) { ; AVX512-LABEL: ne_and_to_abs_vec4x32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -880,9 +869,8 @@ define <4 x i1> @ne_and_to_abs_vec4x32(<4 x i32> %x) { define <4 x i32> @ne_and_to_abs_vec4x32_sext(<4 x i32> %x) { ; AVX512-LABEL: ne_and_to_abs_vec4x32_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX512-NEXT: vpabsd %xmm0, %xmm0 -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll 
b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll index af3b07bc131a9..05c04042cd38f 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll @@ -130,7 +130,7 @@ define i64 @pow2_mask_v8i8(i8 zeroext %0) { ; AVX512-LABEL: pow2_mask_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpbroadcastb %edi, %xmm0 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 46e589b7b1be9..694b3bf5773c4 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -407,9 +407,9 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpbroadcastq %rdi, %zmm2 ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 +; X64-SKX-LARGE-NEXT: vpmuldq (%rax), %zmm1, %zmm1 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 +; X64-SKX-LARGE-NEXT: vpmullq (%rax), %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 @@ -491,9 +491,9 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpbroadcastq %rdi, %zmm2 ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 +; X64-SKX-LARGE-NEXT: vpmuldq (%rax), %zmm1, %zmm1 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 +; X64-SKX-LARGE-NEXT: vpmullq (%rax), %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 @@ -3793,7 +3793,7 @@ define <16 x float> @zext_index(ptr %base, <16 x i32> %ind) { ; X64-SKX-LARGE-LABEL: zext_index: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1 +; X64-SKX-LARGE-NEXT: vandps (%rax), %zmm0, %zmm1 ; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} @@ -4871,7 +4871,7 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 ; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 +; X64-SKX-LARGE-NEXT: vpandd (%rax), %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero @@ -4961,7 +4961,7 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 ; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 +; X64-SKX-LARGE-NEXT: vpandd (%rax), %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero @@ -5067,7 +5067,7 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 ; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 +; X64-SKX-LARGE-NEXT: vpandd (%rax), %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 9b624a935bada..102405208771d 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -1120,24 +1120,11 @@ define i1 @allzeros_v16i8_and1(<16 x i8> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v16i8_and1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v16i8_and1: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v16i8_and1: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v16i8_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <16 x i8> %arg, %tmp1 = icmp ne <16 x i8> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -1230,13 +1217,20 @@ define i1 @allzeros_v32i8_and1(<32 x i8> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i8_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i8_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i8_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne 
<32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -1416,24 +1410,11 @@ define i1 @allzeros_v8i16_and1(<8 x i16> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v8i16_and1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v8i16_and1: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v8i16_and1: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [281479271743489,281479271743489] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v8i16_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <8 x i16> %arg, %tmp1 = icmp ne <8 x i16> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -1656,13 +1637,20 @@ define i1 @allzeros_v16i16_and1(<16 x i16> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v16i16_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v16i16_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v16i16_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -1726,24 +1714,11 @@ define i1 @allzeros_v4i32_and1(<4 x i32> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v4i32_and1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v4i32_and1: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v4i32_and1: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v4i32_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <4 x i32> %arg, %tmp1 = icmp ne <4 x i32> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -1839,13 +1814,20 @@ define i1 @allzeros_v8i32_and1(<8 x i32> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v8i32_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v8i32_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v8i32_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; 
SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <8 x i32> %arg, %tmp1 = icmp ne <8 x i32> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -2021,24 +2003,11 @@ define i1 @allzeros_v2i64_and1(<2 x i64> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v2i64_and1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v2i64_and1: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v2i64_and1: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v2i64_and1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <2 x i64> %arg, %tmp1 = icmp ne <2 x i64> %tmp, zeroinitializer %tmp2 = bitcast <2 x i1> %tmp1 to i2 @@ -2134,13 +2103,20 @@ define i1 @allzeros_v4i64_and1(<4 x i64> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v4i64_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v4i64_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v4i64_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <4 x i64> %arg, %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -2255,8 +2231,7 @@ define i1 @allzeros_v8i64_and1(<8 x i64> %arg) { ; ; AVX512-LABEL: allzeros_v8i64_and1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1] -; AVX512-NEXT: vptestmd %zmm1, %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper @@ -2321,24 +2296,11 @@ define i1 @allzeros_v16i8_and4(<16 x i8> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v16i8_and4: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v16i8_and4: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v16i8_and4: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [289360691352306692,289360691352306692] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v16i8_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <16 x i8> %arg, %tmp1 = icmp ne <16 x i8> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -2431,13 +2393,20 @@ define i1 @allzeros_v32i8_and4(<32 x i8> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i8_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: 
vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i8_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i8_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne <32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -2617,24 +2586,11 @@ define i1 @allzeros_v8i16_and4(<8 x i16> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v8i16_and4: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v8i16_and4: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v8i16_and4: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1125917086973956,1125917086973956] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v8i16_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <8 x i16> %arg, %tmp1 = icmp ne <8 x i16> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -2857,13 +2813,20 @@ define i1 @allzeros_v16i16_and4(<16 x i16> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v16i16_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v16i16_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v16i16_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -2927,24 +2890,11 @@ define i1 @allzeros_v4i32_and4(<4 x i32> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v4i32_and4: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v4i32_and4: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v4i32_and4: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17179869188,17179869188] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v4i32_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <4 x i32> %arg, %tmp1 = icmp ne <4 x i32> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -3040,13 +2990,20 @@ define i1 @allzeros_v8i32_and4(<8 x i32> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v8i32_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: 
vpbroadcastq {{.*#+}} ymm1 = [17179869188,17179869188,17179869188,17179869188] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v8i32_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869188,17179869188,17179869188,17179869188] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v8i32_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <8 x i32> %arg, %tmp1 = icmp ne <8 x i32> %tmp, zeroinitializer %tmp2 = bitcast <8 x i1> %tmp1 to i8 @@ -3222,24 +3179,11 @@ define i1 @allzeros_v2i64_and4(<2 x i64> %arg) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: allzeros_v2i64_and4: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; KNL-LABEL: allzeros_v2i64_and4: -; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; KNL-NEXT: sete %al -; KNL-NEXT: retq -; -; SKX-LABEL: allzeros_v2i64_and4: -; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] -; SKX-NEXT: vptest %xmm1, %xmm0 -; SKX-NEXT: sete %al -; SKX-NEXT: retq +; AVX-LABEL: allzeros_v2i64_and4: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %tmp = and <2 x i64> %arg, %tmp1 = icmp ne <2 x i64> %tmp, zeroinitializer %tmp2 = bitcast <2 x i1> %tmp1 to i2 @@ -3335,13 +3279,20 @@ define i1 @allzeros_v4i64_and4(<4 x i64> %arg) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v4i64_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v4i64_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v4i64_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <4 x i64> %arg, %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -3456,8 +3407,7 @@ define i1 @allzeros_v8i64_and4(<8 x i64> %arg) { ; ; AVX512-LABEL: allzeros_v8i64_and4: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4] -; AVX512-NEXT: vptestmd %zmm1, %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/pr114520.ll b/llvm/test/CodeGen/X86/pr114520.ll index 9bd1f49ff67c9..507c36bf2126d 100644 --- a/llvm/test/CodeGen/X86/pr114520.ll +++ b/llvm/test/CodeGen/X86/pr114520.ll @@ -22,11 +22,10 @@ define <8 x half> @test2(<8 x half> %x) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vcvtph2ps %xmm0, %ymm1 ; CHECK-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; CHECK-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovdw %ymm2, %xmm2 -; CHECK-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm2 & (xmm0 ^ xmm1)) +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; 
CHECK-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} +; CHECK-NEXT: vpmovdw %ymm1, %xmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = mem ^ (xmm1 & (xmm0 ^ mem)) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll index cc7bfb58c329b..be87e95f2d522 100644 --- a/llvm/test/CodeGen/X86/shuffle-half.ll +++ b/llvm/test/CodeGen/X86/shuffle-half.ll @@ -31,8 +31,7 @@ define <32 x half> @build_vec(ptr %p, <32 x i1> %mask) { ; CHECK-NEXT: jne .LBB1_4 ; CHECK-NEXT: jmp .LBB1_5 ; CHECK-NEXT: .LBB1_1: -; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] -; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: testb $2, %al ; CHECK-NEXT: je .LBB1_5 ; CHECK-NEXT: .LBB1_4: # %cond.load1 diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 84ae818d91832..92bdbbf28b9bf 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1333,10 +1333,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX512VL-LABEL: negative: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm1 & ~ymm2) | ymm0 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3] +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm1 & ~mem) +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll index cf5f527b16114..5ab6d84a63f20 100644 --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -401,26 +401,15 @@ define void @test12() nounwind { ; SSE-NEXT: movaps %xmm2, 0 ; SSE-NEXT: ret{{[l|q]}} ; -; AVX1-LABEL: test12: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 0, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] -; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovaps %xmm0, 0 -; AVX1-NEXT: ret{{[l|q]}} -; -; AVX512-LABEL: test12: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps 0, %xmm0 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, 0 -; AVX512-NEXT: ret{{[l|q]}} +; AVX-LABEL: test12: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps 0, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: 
vmovaps %xmm0, 0 +; AVX-NEXT: ret{{[l|q]}} %tmp1 = load <4 x float>, ptr null ; <<4 x float>> [#uses=2] %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] @@ -701,6 +690,8 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %m } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX1: {{.*}} +; AVX512: {{.*}} ; X64-AVX1: {{.*}} ; X64-AVX512: {{.*}} ; X86-AVX1: {{.*}} diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index 76183ac5f8fa3..89356cf5335aa 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -841,8 +841,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; ; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X86-AVX512: # %bb.0: # %entry -; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0] -; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4] ; X86-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; X86-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; X86-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 @@ -891,8 +890,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4] -; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4] ; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll index 01f10372eaa2d..858649afcf69a 100644 --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -152,7 +152,7 @@ define void @neg_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp { ; X86-AVX512-NEXT: vmovups (%ecx), %ymm0 ; X86-AVX512-NEXT: vcmpnltps (%eax), %ymm0, %ymm0 ; X86-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 -; X86-AVX512-NEXT: vmovdqa %ymm0, (%eax) +; X86-AVX512-NEXT: vmovaps %ymm0, (%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl ; @@ -161,7 +161,7 @@ define void @neg_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp { ; X64-AVX512-NEXT: vmovups (%rsi), %ymm0 ; X64-AVX512-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; X64-AVX512-NEXT: vmovdqa %ymm0, (%rax) +; X64-AVX512-NEXT: vmovaps %ymm0, (%rax) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq %v0 = load <8 x float>, ptr %a, align 16 diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index 7296cc27894c3..928b33fba89df 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -1998,7 +1998,7 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1 ; AVX512VL-NEXT: vpmovzxbq 
{{.*#+}} ymm0 = [34,68,102,136] -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} ymm0 {%k1} = [17,51,85,119] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 {%k1} = [17,51,85,119] ; AVX512VL-NEXT: movq %rbp, %rsp ; AVX512VL-NEXT: popq %rbp ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll index 7c788d291a5c7..d2caeda248356 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -910,14 +910,14 @@ define <2 x double> @uitofp_v2i1_v2f64(<2 x i1> %x) #0 { ; ; AVX512VL-32-LABEL: uitofp_v2i1_v2f64: ; AVX512VL-32: # %bb.0: -; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512VL-32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512VL-32-NEXT: retl ; ; AVX512VL-64-LABEL: uitofp_v2i1_v2f64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512VL-64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512VL-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll index d0abd7d5f7512..a47b48b8193f1 100644 --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -160,12 +160,26 @@ define <8 x half> @fabs_v8f16(ptr %p) nounwind { ; X86-AVX2-NEXT: vpand (%eax), %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fabs_v8f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512-NEXT: vpand (%eax), %xmm0, %xmm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fabs_v8f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fabs_v8f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX512FP16-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fabs_v8f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fabs_v8f16: ; X64-SSE: # %bb.0: @@ -185,11 +199,23 @@ define <8 x half> @fabs_v8f16(ptr %p) nounwind { ; X64-AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; -; X64-AVX512-LABEL: fabs_v8f16: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX512-NEXT: vpand (%rdi), %xmm0, %xmm0 -; X64-AVX512-NEXT: retq +; X64-AVX512VL-LABEL: fabs_v8f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: retq +; +; X64-AVX512FP16-LABEL: fabs_v8f16: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512FP16-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512FP16-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fabs_v8f16: +; X64-AVX512VLDQ: # %bb.0: +; 
X64-AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512VLDQ-NEXT: retq %v = load <8 x half>, ptr %p, align 16 %nnv = call <8 x half> @llvm.fabs.v8f16(<8 x half> %v) ret <8 x half> %nnv @@ -366,12 +392,26 @@ define <16 x half> @fabs_v16f16(ptr %p) nounwind { ; X86-AVX2-NEXT: vpand (%eax), %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fabs_v16f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512-NEXT: vpand (%eax), %ymm0, %ymm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fabs_v16f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fabs_v16f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX512FP16-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fabs_v16f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fabs_v16f16: ; X64-SSE: # %bb.0: @@ -393,11 +433,23 @@ define <16 x half> @fabs_v16f16(ptr %p) nounwind { ; X64-AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; -; X64-AVX512-LABEL: fabs_v16f16: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX512-NEXT: vpand (%rdi), %ymm0, %ymm0 -; X64-AVX512-NEXT: retq +; X64-AVX512VL-LABEL: fabs_v16f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; X64-AVX512VL-NEXT: retq +; +; X64-AVX512FP16-LABEL: fabs_v16f16: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX512FP16-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; X64-AVX512FP16-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fabs_v16f16: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; X64-AVX512VLDQ-NEXT: retq %v = load <16 x half>, ptr %p, align 32 %nnv = call <16 x half> @llvm.fabs.v16f16(<16 x half> %v) ret <16 x half> %nnv @@ -587,24 +639,22 @@ define <32 x half> @fabs_v32f16(ptr %p) nounwind { ; X86-AVX512VL-LABEL: fabs_v32f16: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X86-AVX512VL-NEXT: vpandq (%eax), %zmm0, %zmm0 +; X86-AVX512VL-NEXT: vmovdqa64 (%eax), %zmm0 +; X86-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 ; X86-AVX512VL-NEXT: retl ; ; X86-AVX512FP16-LABEL: fabs_v32f16: ; X86-AVX512FP16: # %bb.0: ; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = 
[NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512FP16-NEXT: vpandq (%eax), %zmm0, %zmm0 +; X86-AVX512FP16-NEXT: vmovdqa64 (%eax), %zmm0 +; X86-AVX512FP16-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 ; X86-AVX512FP16-NEXT: retl ; ; X86-AVX512VLDQ-LABEL: fabs_v32f16: ; X86-AVX512VLDQ: # %bb.0: ; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX512VLDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X86-AVX512VLDQ-NEXT: vpandq (%eax), %zmm0, %zmm0 +; X86-AVX512VLDQ-NEXT: vmovaps (%eax), %zmm0 +; X86-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 ; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fabs_v32f16: @@ -635,22 +685,20 @@ define <32 x half> @fabs_v32f16(ptr %p) nounwind { ; ; X64-AVX512VL-LABEL: fabs_v32f16: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512VL-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 +; X64-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512FP16-LABEL: fabs_v32f16: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX512FP16-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; X64-AVX512FP16-NEXT: vmovdqa64 (%rdi), %zmm0 +; X64-AVX512FP16-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; X64-AVX512FP16-NEXT: retq ; ; X64-AVX512VLDQ-LABEL: fabs_v32f16: ; X64-AVX512VLDQ: # %bb.0: -; X64-AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX512VLDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512VLDQ-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; X64-AVX512VLDQ-NEXT: vmovaps (%rdi), %zmm0 +; X64-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; X64-AVX512VLDQ-NEXT: retq %v = load <32 x half>, ptr %p, align 64 %nnv = call <32 x half> @llvm.fabs.v32f16(<32 x half> %v) @@ -766,3 +814,6 @@ define void @PR70947(ptr %src, ptr %dst) nounwind { store <2 x double> %fabs4, ptr %dst4, align 4 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; X64-AVX512: {{.*}} +; X86-AVX512: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_fcopysign.ll b/llvm/test/CodeGen/X86/vec_fcopysign.ll index 5b9cda58bac20..eb968cfdf7114 100644 --- a/llvm/test/CodeGen/X86/vec_fcopysign.ll +++ b/llvm/test/CodeGen/X86/vec_fcopysign.ll @@ -154,14 +154,32 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fcopysign_v8f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm1 -; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %xmm1, %xmm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fcopysign_v8f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VL-NEXT: vmovdqa (%ecx), %xmm1 +; X86-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] +; X86-AVX512VL-NEXT: vpternlogd $202, (%eax), %xmm1, %xmm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fcopysign_v8f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512FP16-NEXT: vmovdqa (%ecx), %xmm1 +; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm0 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX512FP16-NEXT: vpternlogd $202, (%eax), %xmm1, %xmm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fcopysign_v8f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VLDQ-NEXT: vmovdqa (%ecx), %xmm1 +; X86-AVX512VLDQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] +; X86-AVX512VLDQ-NEXT: vpternlogd $202, (%eax), %xmm1, %xmm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v8f16: ; X64-SSE: # %bb.0: @@ -190,12 +208,26 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: retq ; -; X64-AVX512-LABEL: fcopysign_v8f16: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %xmm1, %xmm0 -; X64-AVX512-NEXT: retq +; X64-AVX512VL-LABEL: fcopysign_v8f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm1 +; X64-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] +; X64-AVX512VL-NEXT: vpternlogd $202, (%rsi), %xmm1, %xmm0 +; X64-AVX512VL-NEXT: retq +; +; X64-AVX512FP16-LABEL: fcopysign_v8f16: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovdqa (%rdi), %xmm1 +; X64-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm0 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512FP16-NEXT: vpternlogd $202, (%rsi), %xmm1, %xmm0 +; X64-AVX512FP16-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fcopysign_v8f16: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vmovdqa (%rdi), %xmm1 +; X64-AVX512VLDQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] +; X64-AVX512VLDQ-NEXT: vpternlogd $202, (%rsi), %xmm1, %xmm0 +; X64-AVX512VLDQ-NEXT: retq %a0 = load <8 x half>, ptr %p0, align 16 %a1 = load <8 x half>, ptr %p1, align 16 %t = call 
<8 x half> @llvm.copysign.v8f16(<8 x half> %a0, <8 x half> %a1) @@ -400,14 +432,32 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fcopysign_v16f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqu (%ecx), %ymm1 -; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %ymm1, %ymm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fcopysign_v16f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VL-NEXT: vmovdqu (%ecx), %ymm1 +; X86-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X86-AVX512VL-NEXT: vpternlogd $202, (%eax), %ymm1, %ymm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fcopysign_v16f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512FP16-NEXT: vmovdqu (%ecx), %ymm1 +; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX512FP16-NEXT: vpternlogd $202, (%eax), %ymm1, %ymm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fcopysign_v16f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VLDQ-NEXT: vmovdqu (%ecx), %ymm1 +; X86-AVX512VLDQ-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X86-AVX512VLDQ-NEXT: vpternlogd $202, (%eax), %ymm1, %ymm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v16f16: ; X64-SSE: # %bb.0: @@ -441,12 +491,26 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; -; X64-AVX512-LABEL: fcopysign_v16f16: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 -; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %ymm1, %ymm0 -; X64-AVX512-NEXT: retq +; X64-AVX512VL-LABEL: fcopysign_v16f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X64-AVX512VL-NEXT: vpternlogd $202, (%rsi), %ymm1, %ymm0 +; X64-AVX512VL-NEXT: retq +; +; X64-AVX512FP16-LABEL: fcopysign_v16f16: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512FP16-NEXT: vpternlogd $202, (%rsi), %ymm1, %ymm0 +; X64-AVX512FP16-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fcopysign_v16f16: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512VLDQ-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X64-AVX512VLDQ-NEXT: vpternlogd $202, (%rsi), 
%ymm1, %ymm0 +; X64-AVX512VLDQ-NEXT: retq %a0 = load <16 x half>, ptr %p0, align 16 %a1 = load <16 x half>, ptr %p1, align 16 %t = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a0, <16 x half> %a1) @@ -691,14 +755,32 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fcopysign_v32f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqu64 (%ecx), %zmm1 -; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %zmm1, %zmm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fcopysign_v32f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VL-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X86-AVX512VL-NEXT: vpternlogd $202, (%eax), %zmm1, %zmm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fcopysign_v32f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512FP16-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX512FP16-NEXT: vpternlogd $202, (%eax), %zmm1, %zmm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fcopysign_v32f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VLDQ-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512VLDQ-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X86-AVX512VLDQ-NEXT: vpternlogd $202, (%eax), %zmm1, %zmm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v32f16: ; X64-SSE: # %bb.0: @@ -746,12 +828,26 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: retq ; -; X64-AVX512-LABEL: fcopysign_v32f16: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 -; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %zmm1, %zmm0 -; X64-AVX512-NEXT: retq +; X64-AVX512VL-LABEL: fcopysign_v32f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X64-AVX512VL-NEXT: vpternlogd $202, (%rsi), %zmm1, %zmm0 +; 
X64-AVX512VL-NEXT: retq +; +; X64-AVX512FP16-LABEL: fcopysign_v32f16: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512FP16-NEXT: vpternlogd $202, (%rsi), %zmm1, %zmm0 +; X64-AVX512FP16-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fcopysign_v32f16: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-AVX512VLDQ-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] +; X64-AVX512VLDQ-NEXT: vpternlogd $202, (%rsi), %zmm1, %zmm0 +; X64-AVX512VLDQ-NEXT: retq %a0 = load <32 x half>, ptr %p0, align 16 %a1 = load <32 x half>, ptr %p1, align 16 %t = call <32 x half> @llvm.copysign.v32f16(<32 x half> %a0, <32 x half> %a1) @@ -761,11 +857,5 @@ declare <32 x half> @llvm.copysign.v32f16(<32 x half>, <32 x half>) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; X64: {{.*}} ; X64-AVX: {{.*}} -; X64-AVX512FP16: {{.*}} -; X64-AVX512VL: {{.*}} -; X64-AVX512VLDQ: {{.*}} ; X86: {{.*}} ; X86-AVX: {{.*}} -; X86-AVX512FP16: {{.*}} -; X86-AVX512VL: {{.*}} -; X86-AVX512VLDQ: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll index 64204a5c2123f..035e63badd0a4 100644 --- a/llvm/test/CodeGen/X86/vec_fneg.ll +++ b/llvm/test/CodeGen/X86/vec_fneg.ll @@ -158,12 +158,26 @@ define <8 x half> @fneg_v8f16(ptr %p) nounwind { ; X86-AVX2-NEXT: vpxor (%eax), %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fneg_v8f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-AVX512-NEXT: vpxor (%eax), %xmm0, %xmm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fneg_v8f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fneg_v8f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX512FP16-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fneg_v8f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX512VLDQ-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fneg_v8f16: ; X64-SSE: # %bb.0: @@ -183,11 +197,23 @@ define <8 x half> @fneg_v8f16(ptr %p) nounwind { ; X64-AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; -; X64-AVX512-LABEL: fneg_v8f16: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; X64-AVX512-NEXT: retq +; X64-AVX512VL-LABEL: fneg_v8f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; 
X64-AVX512VL-NEXT: retq +; +; X64-AVX512FP16-LABEL: fneg_v8f16: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512FP16-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512FP16-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fneg_v8f16: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX512VLDQ-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512VLDQ-NEXT: retq %v = load <8 x half>, ptr %p, align 16 %nnv = fsub <8 x half> , %v ret <8 x half> %nnv @@ -361,12 +387,26 @@ define <16 x half> @fneg_v16f16(ptr %p) nounwind { ; X86-AVX2-NEXT: vpxor (%eax), %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fneg_v16f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-AVX512-NEXT: vpxor (%eax), %ymm0, %ymm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fneg_v16f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: vmovups (%eax), %ymm0 +; X86-AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fneg_v16f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: vmovups (%eax), %ymm0 +; X86-AVX512FP16-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fneg_v16f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: vmovups (%eax), %ymm0 +; X86-AVX512VLDQ-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fneg_v16f16: ; X64-SSE: # %bb.0: @@ -388,11 +428,23 @@ define <16 x half> @fneg_v16f16(ptr %p) nounwind { ; X64-AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; -; X64-AVX512-LABEL: fneg_v16f16: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; X64-AVX512-NEXT: retq +; X64-AVX512VL-LABEL: fneg_v16f16: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; X64-AVX512VL-NEXT: retq +; +; X64-AVX512FP16-LABEL: fneg_v16f16: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX512FP16-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; X64-AVX512FP16-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fneg_v16f16: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX512VLDQ-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; X64-AVX512VLDQ-NEXT: retq %v = load <16 x half>, ptr %p, align 16 %nnv = fsub <16 x half> , %v ret <16 x half> %nnv @@ -579,24 +631,22 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind { ; X86-AVX512VL-LABEL: fneg_v32f16: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; 
X86-AVX512VL-NEXT: vpxorq (%eax), %zmm0, %zmm0 +; X86-AVX512VL-NEXT: vmovdqu64 (%eax), %zmm0 +; X86-AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 ; X86-AVX512VL-NEXT: retl ; ; X86-AVX512FP16-LABEL: fneg_v32f16: ; X86-AVX512FP16: # %bb.0: ; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-AVX512FP16-NEXT: vpxorq (%eax), %zmm0, %zmm0 +; X86-AVX512FP16-NEXT: vmovdqu64 (%eax), %zmm0 +; X86-AVX512FP16-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 ; X86-AVX512FP16-NEXT: retl ; ; X86-AVX512VLDQ-LABEL: fneg_v32f16: ; X86-AVX512VLDQ: # %bb.0: ; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-AVX512VLDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X86-AVX512VLDQ-NEXT: vpxorq (%eax), %zmm0, %zmm0 +; X86-AVX512VLDQ-NEXT: vmovups (%eax), %zmm0 +; X86-AVX512VLDQ-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 ; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fneg_v32f16: @@ -627,22 +677,20 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind { ; ; X64-AVX512VL-LABEL: fneg_v32f16: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512VL-NEXT: vpxorq (%rdi), %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512FP16-LABEL: fneg_v32f16: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-AVX512FP16-NEXT: vpxorq (%rdi), %zmm0, %zmm0 +; X64-AVX512FP16-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512FP16-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; X64-AVX512FP16-NEXT: retq ; ; X64-AVX512VLDQ-LABEL: fneg_v32f16: ; X64-AVX512VLDQ: # %bb.0: -; X64-AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-AVX512VLDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512VLDQ-NEXT: vpxorq (%rdi), %zmm0, %zmm0 +; X64-AVX512VLDQ-NEXT: vmovups (%rdi), %zmm0 +; X64-AVX512VLDQ-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; X64-AVX512VLDQ-NEXT: retq %v = load <32 x half>, ptr %p, align 16 %nnv = fsub <32 x half> , %v @@ -650,4 +698,6 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind { } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; X64: {{.*}} +; X64-AVX512: {{.*}} ; X86: {{.*}} +; X86-AVX512: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index fd0525e6d56a2..98dc9bb898be5 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -108,7 +108,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VL-LABEL: var_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -140,7 +140,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -1016,7 +1016,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1048,7 +1048,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index fdd0d68b89003..239ea9d5a2d9e 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -78,7 +78,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; AVX512VL-LABEL: var_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm3 = [63,63,63,63] ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 @@ -109,7 +109,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [63,63,63,63] ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 @@ -800,7 +800,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -831,7 +831,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; 
AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -1038,7 +1038,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1069,7 +1069,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 1d807fa85ddc5..b863de0c378ae 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -437,7 +437,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -465,7 +465,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -562,7 +562,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm5 @@ -597,7 +597,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 9e872cc6d74a9..2288746d715a1 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -791,7 +791,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: 
vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -813,7 +813,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 436fbe31f7a34..67c53f8f96809 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -317,7 +317,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5 @@ -345,7 +345,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index 322ebe22671e6..803c79eeddc84 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -368,14 +368,13 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; ; AVX512VL-LABEL: splatvar_funnnel_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] -; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VL-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -394,14 +393,13 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] -; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; 
AVX512VLBW-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index a56b0a6351a3b..4468a7abb466e 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -108,7 +108,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VL-LABEL: var_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -140,7 +140,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1137,7 +1137,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1169,7 +1169,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 0fa2c858ff000..ae2c61844645d 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -78,7 +78,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; AVX512VL-LABEL: var_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm3 = [63,63,63,63] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -109,7 +109,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [63,63,63,63] ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -830,7 +830,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -861,7 +861,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1070,7 +1070,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1101,7 +1101,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 665223167fbb4..5d7f36901f875 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -435,7 +435,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -464,7 +464,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -564,7 +564,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 @@ -600,7 +600,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 3d4f283260aa5..e09cc70e97890 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -830,7 +830,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> 
%amt) nounw ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -852,7 +852,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 11ea650e1f02d..c9dd5b57cd637 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -315,7 +315,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm5 @@ -343,7 +343,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 372deb05e550c..2283080354531 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -369,12 +369,11 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; ; AVX512VL-LABEL: splatvar_funnnel_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] -; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VL-NEXT: vpsrld %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -395,12 +394,11 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31] -; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VLBW-NEXT: vpsrld %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; 
AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index ae4f85ce42a19..4c0b438bff575 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -3138,27 +3138,28 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -3174,27 +3175,28 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -3210,27 +3212,28 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: 
vpmovsxbw {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -3246,27 +3249,28 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: 
vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -6519,249 +6523,253 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512BW-LABEL: load_i16_stride4_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm7, 
%zmm8, %zmm9 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm14 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 
64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm6 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm4 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride4_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride4_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm12 +; 
AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), 
%zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..988709033d6fc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -2032,8 +2032,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512-FCP-NEXT: 
vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,4,6,0] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] @@ -2047,8 +2046,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,4,7,0] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] @@ -2209,8 +2207,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,4,6,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] @@ -2224,8 +2221,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,4,7,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] @@ -4118,8 +4114,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] -; AVX512-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,0,0,0,1,4,6,3] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] @@ -4147,8 +4142,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,4,6,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) @@ -4170,8 +4164,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,2,4,7,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 @@ -4487,8 +4480,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] -; AVX512DQ-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,0,0,0,1,4,6,3] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] @@ -4516,8 +4508,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,4,6,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) @@ -4539,8 +4530,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,2,4,7,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; 
AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 @@ -8373,7 +8363,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512-FCP-NEXT: subq $616, %rsp # imm = 0x268 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8391,11 +8381,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10,11],ymm1[12],ymm5[13],ymm1[14],ymm5[15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -8403,7 +8393,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm24 ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] @@ -8431,8 +8421,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm30 & (zmm7 ^ zmm4)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 @@ -8468,24 +8458,23 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %ymm0, 
%ymm17, %ymm3 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512-FCP-NEXT: vpermd %ymm21, %ymm18, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm30 & (zmm2 ^ zmm1)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 ; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [2,0,0,0,4,7,1,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm26 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm19, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -8502,41 +8491,39 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] -; AVX512-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [0,0,0,0,1,4,6,3] ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0 +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm23, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm21, %ymm25, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = 
[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm30 & (zmm4 ^ zmm10)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm17 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm19, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8550,70 +8537,69 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm23, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512-FCP-NEXT: vpermd %ymm16, %ymm25, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm30 & (zmm2 ^ zmm0)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm19 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,5,2,5,7,0,0] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm23 +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm29 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm18 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,0,0,0,1,4,6,0] +; AVX512-FCP-NEXT: vpermd %ymm21, %ymm26, %ymm14 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm30 & (zmm14 ^ zmm0)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm4 ; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm14 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 @@ -8621,7 +8607,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 @@ -8629,46 +8615,47 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm28 ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, 
%zmm28 +; AVX512-FCP-NEXT: vpermd %ymm16, %ymm26, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm30 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [1,3,6,0,5,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,2,4,7,0] +; AVX512-FCP-NEXT: vpermd %ymm21, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm23 ; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm30 +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm17, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm16, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm19 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] @@ -8682,20 +8669,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 +; 
AVX512-FCP-NEXT: vpermd %ymm21, %ymm26, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm4 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] @@ -8718,19 +8704,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm22 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm4 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 @@ -8741,8 +8727,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm26, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 ; 
AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload @@ -8761,7 +8747,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] @@ -8783,8 +8769,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] @@ -8811,14 +8797,15 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm3, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm3, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512-FCP-NEXT: addq $616, %rsp # imm = 0x268 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -9295,7 +9282,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512DQ-FCP-NEXT: subq $616, %rsp # imm = 0x268 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9313,11 +9300,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 -; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10,11],ymm1[12],ymm5[13],ymm1[14],ymm5[15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -9325,7 +9312,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] @@ -9353,8 +9340,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm30 & (zmm7 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 @@ -9390,24 +9377,23 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm30 & (zmm2 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, 
%ymm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [2,0,0,0,4,7,1,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm19, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -9424,41 +9410,39 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] -; AVX512DQ-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [0,0,0,0,1,4,6,3] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm23, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm25, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm30 & (zmm4 ^ zmm10)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm19, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9472,70 +9456,69 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm23, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm25, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm30 & (zmm2 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm19 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,5,2,5,7,0,0] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm23 +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm29 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm18 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,0,0,0,1,4,6,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm26, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm30 & (zmm14 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm14 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 @@ -9543,7 +9526,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 @@ -9551,46 +9534,47 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm28 ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm26, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm30 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: 
vpshufb %xmm0, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [1,3,6,0,5,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,2,4,7,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm30 +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm17, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm19 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] @@ -9604,20 +9588,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm26, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] @@ -9640,19 +9623,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm22 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 @@ -9663,8 +9646,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm26, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload @@ -9683,7 +9666,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] @@ -9705,8 +9688,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] @@ -9733,14 +9716,15 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512DQ-FCP-NEXT: addq $616, %rsp # imm = 0x268 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index feb75b21d5c8d..98b7ce1f62cec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -2874,14 +2874,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,4,10,16,22,28,0,0,0,4,10,16,22,28] ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %zmm5, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = 
[16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,5,11,17,23,29,0,0,0,5,11,17,23,29] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] @@ -2929,14 +2929,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,4,10,16,22,28,0,0,0,4,10,16,22,28] ; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,5,11,17,23,29,0,0,0,5,11,17,23,29] ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] @@ -2984,14 +2984,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,4,10,16,22,28,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,5,11,17,23,29,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] @@ -3039,14 +3039,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,4,10,16,22,28,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; 
AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,5,11,17,23,29,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 038c73bd9fed2..0796d5defcc14 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -3385,9 +3385,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i16_stride7_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,2,6,9,13] ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,0,0,0,2,5,9,12] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9 @@ -3395,7 +3395,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,3,6,10,13] ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2] @@ -3502,8 +3502,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,0,4,7,0] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 @@ -3706,9 +3705,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,2,6,9,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm17 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,0,0,0,2,5,9,12] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9 @@ -3716,7 +3715,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,3,6,10,13] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2] @@ -3823,8 +3822,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,0,4,7,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 @@ -7120,10 +7118,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,6,9,13] ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,2,5,9,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,3,6,10,13] ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm13 @@ -7333,13 +7331,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,0,4,7,0] ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,6,9,13] ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm9, %zmm9 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] @@ 
-7776,10 +7773,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,0,0,0,2,6,9,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,2,5,9,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,3,6,10,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm4 @@ -7995,8 +7992,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,4,7,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 @@ -14893,34 +14889,34 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512-FCP-NEXT: subq $1832, %rsp # imm = 0x728 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm25 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm26, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermd %zmm25, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm21, %zmm3, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm22 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vporq %ymm4, %ymm6, %ymm17 +; AVX512-FCP-NEXT: vporq %ymm4, %ymm6, %ymm16 ; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %xmm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm18 ; AVX512-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2] ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 @@ -14929,17 +14925,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm16 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm26 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm14 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm2 ; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm17 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] @@ -14950,13 +14946,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm7 @@ -14964,67 +14960,67 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm8, %ymm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7] +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm21 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,0,2] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm21[0,1,0,2] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] ; AVX512-FCP-NEXT: vmovdqa 688(%rdi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [2,6,9,0,13,0,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermd %zmm26, %zmm23, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermd %zmm25, %zmm23, %zmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm25 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm18 = [2,5,2,5,2,5,2,5] -; AVX512-FCP-NEXT: vpermd %ymm31, %ymm18, %ymm12 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [2,5,2,5,2,5,2,5] +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm17, %ymm12 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm18 ; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm19 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm14 @@ -15037,48 +15033,49 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm10 -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm23, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb 
%ymm5, %ymm7, %ymm5 +; AVX512-FCP-NEXT: vpor %ymm5, %ymm10, %ymm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm18, %ymm7 +; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpermd %ymm21, %ymm17, %ymm7 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpermd %zmm27, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} 
xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,1,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,1,3] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] @@ -15089,49 +15086,49 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,1,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm21[0,1,1,3] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 
%xmm20, %xmm5 -; AVX512-FCP-NEXT: vpsrlq $48, %xmm20, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm6 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm18, %xmm9 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: vpermd %zmm21, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,3,6,10,13] +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm4, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm4, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] @@ -15140,173 +15137,175 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512-FCP-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm9 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm17, %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] -; AVX512-FCP-NEXT: vpermd %ymm31, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7] +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm11, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm10 = [0,0,0,0,2,5,9,12] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm10, %zmm8 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6,7],ymm8[8,9,10,11,12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 ; AVX512-FCP-NEXT: vpsrld $16, %xmm19, %xmm8 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7] +; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3],xmm8[4],xmm2[5],xmm8[6,7] ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,4,7,11,14] -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm13 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm8 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14] +; AVX512-FCP-NEXT: vpermd %zmm27, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6],xmm13[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 {%k1} # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} 
ymm15 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm20, %zmm13 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm26 {%k1} # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermd %ymm21, %ymm11, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm10, %zmm7 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512-FCP-NEXT: vpsrld $16, %xmm14, %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512-FCP-NEXT: vpsrld $16, %xmm31, %xmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm31, %ymm12, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,0,0,0,0,4,7,0] +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm26, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,2,6,9,13] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7],ymm7[8,9,10,11,12],ymm5[13,14,15] +; AVX512-FCP-NEXT: vpunpckhwd 
{{.*#+}} xmm6 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512-FCP-NEXT: vpermd %zmm5, %zmm10, %zmm7 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4],xmm13[5],xmm11[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3],xmm0[4],xmm13[5],xmm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm10 -; AVX512-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,0,0,4,8,11,15] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm11, %zmm13 +; AVX512-FCP-NEXT: vpermd %zmm16, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; 
AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,0,0,0,4,8,11,15] +; AVX512-FCP-NEXT: vpermd %zmm16, %zmm10, %zmm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4,5,6],xmm13[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm10, %zmm17, %zmm13 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6],xmm13[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7] -; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6],xmm13[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm31 +; 
AVX512-FCP-NEXT: vpermd %zmm27, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm5, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm17 +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm10, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm20, %zmm2 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm14[2],ymm7[3,4,5],ymm14[6],ymm7[7] +; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermd %zmm21, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512-FCP-NEXT: vpermd %zmm27, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm5, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermd %ymm21, %ymm26, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm8[2],ymm15[3,4,5],ymm8[6],ymm15[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm11 +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] @@ -15318,63 +15317,61 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %ymm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermd %zmm27, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm23 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,7,10,14,0,0,0] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermd %zmm24, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm29 & (zmm27 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm28 & (zmm25 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm8[3],ymm15[4,5],ymm8[6],ymm15[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 ; 
AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm22 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm23 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [2,0,0,0,6,9,13,0] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm20, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,6,9,13,0] +; AVX512-FCP-NEXT: vpermd %zmm16, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] @@ -15383,31 +15380,30 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm3 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX512-FCP-NEXT: vmovdqa %xmm7, %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vpermd %zmm26, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm28 & (zmm22 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm5 +; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpermd %zmm27, %zmm12, %zmm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm20 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm20 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15416,29 +15412,30 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm27 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm27 +; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm30 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,4,7,11,14,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm24, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm10 ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] ; 
AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2],xmm6[3],xmm13[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] @@ -15449,44 +15446,45 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm22 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpermd %zmm26, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm21 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm21 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm18, %zmm8 +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm18, %zmm8 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2],xmm8[3],xmm12[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] @@ -15494,134 +15492,134 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm24, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm8 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm15 +; AVX512-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm11 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm7[1,2],ymm15[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512-FCP-NEXT: 
vpshufb %ymm8, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm4 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4],ymm10[5],ymm3[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm11 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,1,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm9, %zmm9 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm12 = mem ^ (zmm9 & (zmm12 ^ mem)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm13 = mem ^ (zmm9 & 
(zmm13 ^ mem)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm15 = mem ^ (zmm9 & (zmm15 ^ mem)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = zmm6 ^ (zmm9 & (zmm6 ^ mem)) ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm17 = zmm17 ^ (zmm9 & (zmm17 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm21 = zmm21 ^ (zmm9 & (zmm21 ^ mem)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm12)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm9 & (zmm11 ^ zmm13)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm9 & (zmm11 ^ zmm15)) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm19 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm19 = zmm19 ^ (zmm29 & (zmm19 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ mem)) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm10 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm29 & (zmm2 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm8 = zmm8 ^ (zmm29 & (zmm8 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm6 = zmm6 ^ (zmm29 & (zmm6 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm8 = zmm8 ^ (zmm28 & (zmm8 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm9 = zmm9 ^ (zmm28 & (zmm9 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm10 = zmm10 ^ (zmm28 & (zmm10 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm28 & (zmm1 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm28 & (zmm1 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte 
Folded Reload -; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm29 & (zmm4 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm28 & (zmm4 ^ mem)) ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm29 & (zmm0 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm28 & (zmm0 ^ mem)) ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FCP-NEXT: addq $1800, %rsp # imm = 0x708 +; AVX512-FCP-NEXT: addq $1832, %rsp # imm = 0x728 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -16500,15 +16498,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $1240, %rsp # imm = 0x4D8 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,0,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm12[2],ymm14[3,4,5],ymm12[6],ymm14[7] +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u] @@ -16521,7 +16520,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm19 ; AVX512DQ-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16533,12 +16533,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512DQ-FCP-NEXT: vporq %ymm1, %ymm3, %ymm31 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: 
vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] @@ -16550,10 +16550,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm0[2],ymm5[3,4,5],ymm0[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] @@ -16562,34 +16561,36 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm5[1],xmm13[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm27 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm2[1],xmm11[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm29 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm8, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 ; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm21 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,1,0,2] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm21[0,1,0,2] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512DQ-FCP-NEXT: vmovdqa 688(%rdi), %xmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 @@ -16606,61 +16607,60 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [2,5,2,5,2,5,2,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm16, %ymm13 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm27 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = 
[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm9, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm16, %ymm7 +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm16, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm22 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16668,14 +16668,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm9 +; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm16, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: 
vpor %ymm4, %ymm9, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] @@ -16692,58 +16694,58 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2],xmm6[3],xmm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm13 +; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm16, %zmm13 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm21[0,1,1,3] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm29 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm3[1],ymm11[2,3,4],ymm3[5],ymm11[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm11 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm27, %xmm7 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 576(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [0,0,0,0,3,6,10,13] +; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm20 {%k1} # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-FCP-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm30, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm22, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] @@ -16751,21 +16753,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,2,5,9,12] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7],ymm6[8,9,10,11,12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm11 -; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm19, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm12 +; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm30, %xmm6 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] @@ -16777,36 +16779,36 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3,4,5,6],xmm14[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm9 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm27, %zmm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm31 {%k1} # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm19 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,0,0,0,0,4,7,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm22, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = 
[0,0,0,0,2,6,9,13] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 @@ -16821,8 +16823,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3],xmm14[4],xmm5[5],xmm14[6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm20, %zmm14 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3,4,5,6],xmm6[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] @@ -16837,12 +16839,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm10 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm20, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm10[6,7] @@ -16857,7 +16859,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16868,7 +16870,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1 @@ -16899,15 +16901,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 ^ (zmm25 & (zmm1 ^ mem)) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm22, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 @@ -16923,7 +16925,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm9, %zmm6 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm9, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] @@ -16935,18 +16937,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm23 = zmm23 ^ (zmm25 & (zmm23 ^ mem)) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm31 +; 
AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm13[2],ymm4[3,4,5],ymm13[6],ymm4[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm27 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3,4,5],xmm3[6],xmm9[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 @@ -16967,16 +16968,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,7,10,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm25 & (zmm28 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm18 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7] @@ -16987,7 +16988,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,6,9,13,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm16, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] @@ -17006,20 +17007,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 
64-byte Reload -; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm19, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm29 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm25 & (zmm26 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] @@ -17030,8 +17032,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm19 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] @@ -17039,24 +17041,24 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,4,7,11,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm22, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm18 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,0,0,0,6,10,13,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] @@ -17096,8 +17098,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm9 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm9 = mem ^ (zmm25 & (zmm9 ^ mem)) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] @@ -17105,15 +17107,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,4,8,11,15,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6 @@ -17121,7 +17123,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm8 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index fff21f9aad1bb..3e10c8a2d3676 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -7264,115 +7264,134 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # zmm10 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm12 = [2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm13 = [3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, 
%zmm12, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm14 = [4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm13 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm15 = [5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 -; 
AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm14 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -7383,115 +7402,134 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 
64(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: movb $-64, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 
{{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm12 = [2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm13 = [3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2w 
%zmm5, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm14 = [4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm15 = [5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, 
%zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -7502,115 +7540,134 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 
64(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: movb $-64, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = 
[2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm12 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm13 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} 
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm14 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm15 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = 
[7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -7621,115 +7678,134 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), 
%zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] 
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm12 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm13 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] 
; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm14 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm15 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm3, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -16316,877 +16392,1141 @@ define void 
@load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512BW-LABEL: load_i16_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm18 -; AVX512BW-NEXT: movb $-64, %dil -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, 
%zmm4 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm8, %zmm2 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 
+; AVX512BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm26 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm25 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm23 +; 
AVX512BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm30 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm29 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm31, %zmm24 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm19 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm27 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm31 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm1, %zmm15 +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm10, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqa64 
%zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm10 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm9, %zmm11 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm4, %zmm7 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm9, %zmm10 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm11, %zmm14 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm18 = [7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermi2w 
%zmm6, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm18, %zmm8 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512BW-NEXT: vzeroupper 
; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride8_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm13, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm18 -; AVX512BW-FCP-NEXT: movb $-64, %dil -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 
-; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-FCP-NEXT: # 
zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2w 
%zmm23, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm30, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = 
[5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm30 +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm31, %zmm24 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm27 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: movb $-64, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: 
vpermi2w %zmm6, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2w 
%zmm29, %zmm10, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm11, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm18 = [7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm18, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-FCP-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride8_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm18 -; AVX512DQ-BW-NEXT: movb $-64, %dil -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; 
AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm13, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w 
%zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm26 +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm30, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm29, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm30 +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm17, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm31, %zmm24 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: 
vpermi2w %zmm3, %zmm15, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm27 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm15, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm1, %zmm15 +; AVX512DQ-BW-NEXT: movb $-64, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; 
AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm3, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm11, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm18 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm18, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] 
+; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-BW-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm13, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm18 -; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, 
%zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 
%zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm1 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm30, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm17, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm31, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm15, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm1, %zmm15 +; AVX512DQ-BW-FCP-NEXT: movb $-64, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 
%zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] 
+; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, 
%zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm10, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm11, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm18 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm18, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-BW-FCP-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <512 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 0bf1260738439..6cda6a1a1ca88 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -1465,29 +1465,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1501,29 +1498,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; 
AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1537,29 +1531,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 
{{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1573,29 +1564,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1609,29 +1597,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: 
vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1645,29 +1630,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1681,29 +1663,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1717,29 +1696,26 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = 
[0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,8,12,16,20,24,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,9,13,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -2976,497 +2952,473 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i32_stride4_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} 
ymm12 = [1,5,9,13,17,21,25,29] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride4_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = 
[2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; 
AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride4_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = 
zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: 
load_i32_stride4_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride4_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 
%zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vmovdqa64 %zmm6, 
%zmm11 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, 
%zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; 
AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} 
zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i32>, ptr %in.vec, align 64 @@ -6103,913 +6055,889 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i32_stride4_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: 
vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = 
[0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-NEXT: vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm3, %zmm26, %zmm2 +; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512-NEXT: 
vmovdqa64 %zmm24, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride4_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; 
AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm3, 
%zmm26, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride4_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} 
zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 
-; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; 
AVX512DQ-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm26, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = 
zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = 
zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm26, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride4_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; 
AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512BW-NEXT: 
vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf64: ; AVX512BW-FCP: # %bb.0: 
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 
+; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = 
zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm26, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = 
zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: 
vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; 
AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm28 = 
zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm26, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), 
%zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm20, %zmm21 +; 
AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm26, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm22, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm3, %zmm24, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm26, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm26, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm17, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm26, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 
(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index c08442f9d9d01..44477f4b0b076 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -2011,8 +2011,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2021,8 +2020,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ -2031,8 +2029,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 @@ -2041,14 +2038,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512-NEXT: # 
zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -2072,8 +2067,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2082,8 +2076,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ -2092,8 +2085,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 @@ -2102,14 +2094,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -2133,8 +2123,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2143,8 +2132,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ -2153,8 +2141,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 @@ -2163,14 +2150,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -2194,8 +2179,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2204,8 +2188,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ -2214,8 +2197,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -2224,14 +2206,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -2255,8 +2235,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2265,8 +2244,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ 
-2275,8 +2253,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -2285,14 +2262,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -2316,8 +2291,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2326,8 +2300,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ -2336,8 +2309,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-FCP-NEXT: 
vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -2346,14 +2318,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -2377,8 +2347,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2387,8 +2356,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ -2397,8 +2365,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 @@ -2407,14 +2374,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -2438,8 +2403,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 @@ -2448,8 +2412,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 @@ -2458,8 +2421,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -2468,14 +2430,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 @@ -4170,8 +4130,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4186,8 +4145,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4203,8 +4161,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512-NEXT: movb $7, %al @@ -4219,8 +4176,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4230,8 +4186,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -4269,8 +4224,7 @@ 
define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4285,8 +4239,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4302,8 +4255,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512-FCP-NEXT: movb $7, %al @@ -4318,8 +4270,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4329,8 +4280,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -4368,8 +4318,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] 
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4384,8 +4333,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4401,8 +4349,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512DQ-NEXT: movb $7, %al @@ -4417,8 +4364,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4428,8 +4374,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -4467,8 +4412,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4483,8 +4427,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4500,8 +4443,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512DQ-FCP-NEXT: movb $7, %al @@ -4516,8 +4458,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4527,8 +4468,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -4566,8 +4506,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4582,8 +4521,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = 
[0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4599,8 +4537,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512BW-NEXT: movb $7, %al @@ -4615,8 +4552,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4626,8 +4562,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -4665,8 +4600,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4681,8 +4615,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4698,8 +4631,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512BW-FCP-NEXT: movb $7, %al @@ -4714,8 +4646,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4725,8 +4656,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -4764,8 +4694,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4780,8 +4709,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4797,8 +4725,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} 
zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512DQ-BW-NEXT: movb $7, %al @@ -4813,8 +4740,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4824,8 +4750,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -4863,8 +4788,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] @@ -4879,8 +4803,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] @@ -4896,8 +4819,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm15, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al @@ -4912,8 +4834,7 @@ define 
void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} @@ -4923,8 +4844,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 @@ -8384,8 +8304,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8399,8 +8318,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8418,8 +8336,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -8429,12 +8346,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; 
AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm30 @@ -8588,8 +8503,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8603,8 +8517,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8622,8 +8535,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -8633,12 +8545,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 @@ -8792,8 +8702,7 @@ define 
void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8807,8 +8716,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8826,8 +8734,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -8837,12 +8744,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm30 @@ -8996,8 +8901,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9011,8 +8915,7 @@ define void @load_i32_stride5_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9030,8 +8933,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -9041,12 +8943,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 @@ -9200,8 +9100,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9215,8 +9114,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9234,8 +9132,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -9245,12 +9142,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 @@ -9404,8 +9299,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9419,8 +9313,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9438,8 +9331,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -9449,12 +9341,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 @@ -9608,8 +9498,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9623,8 +9512,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9642,8 +9530,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -9653,12 +9540,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = 
[0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm30 @@ -9812,8 +9697,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,3,8,13,18,23,28,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9827,8 +9711,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,0,4,9,14,19,24,29,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9846,8 +9729,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,0,5,10,15,20,25,30,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 @@ -9857,12 +9739,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,0,0,17,22,27,0,5,10,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,2,7,12,17,22,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 85ed61811af53..b36cf8cd34f8d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -197,7 +197,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,0,0] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 @@ -256,7 +256,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 @@ -315,7 +315,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,0,0] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 @@ -374,7 +374,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 @@ -2664,74 +2664,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: 
vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512-NEXT: movw $31, %di ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = 
[0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -2752,74 +2740,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512-FCP-NEXT: movw $31, %di ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -2840,74 +2816,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512DQ-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-NEXT: movw $31, %di ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -2928,74 +2892,62 @@ define void 
@load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: movw $31, %di ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: 
vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -3016,74 +2968,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: movw $31, %di ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -3104,74 +3044,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: 
vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: movw $31, %di ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -3192,74 +3120,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 
%zmm8, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: movw $31, %di ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -3280,74 +3196,62 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movw $31, %di ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-BW-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) @@ -5571,8 +5475,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -5581,8 +5484,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512-NEXT: movw $-2048, %di # imm = 0xF800 @@ -5593,16 +5495,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] 
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 ; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -5614,15 +5514,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512-NEXT: movw $31, %di ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -5634,13 +5532,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -5649,8 +5545,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -5659,8 +5554,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512-NEXT: # zmm21 = 
mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512-NEXT: movb $-32, %dil @@ -5671,14 +5565,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -5716,8 +5608,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -5726,8 +5617,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512-FCP-NEXT: movw $-2048, %di # imm = 0xF800 @@ -5738,16 +5628,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm10, 
%zmm17, %zmm18 ; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -5759,15 +5647,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512-FCP-NEXT: movw $31, %di ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -5779,13 +5665,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -5794,8 +5678,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -5804,8 +5687,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512-FCP-NEXT: movb $-32, %dil @@ -5816,14 +5698,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; 
AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -5861,8 +5741,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -5871,8 +5750,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512DQ-NEXT: movw $-2048, %di # imm = 0xF800 @@ -5883,16 +5761,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 ; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -5904,15 +5780,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = 
[18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-NEXT: movw $31, %di ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -5924,13 +5798,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -5939,8 +5811,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -5949,8 +5820,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512DQ-NEXT: movb $-32, %dil @@ -5961,14 +5831,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 
; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -6006,8 +5874,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -6016,8 +5883,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: movw $-2048, %di # imm = 0xF800 @@ -6028,16 +5894,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -6049,15 +5913,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-FCP-NEXT: movw $31, %di ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -6069,13 +5931,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -6084,8 +5944,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -6094,8 +5953,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512DQ-FCP-NEXT: movb $-32, %dil @@ -6106,14 +5964,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -6151,8 +6007,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -6161,8 +6016,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 @@ -6173,16 +6027,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -6194,15 +6046,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512BW-NEXT: movw $31, %di ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = 
[0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -6214,13 +6064,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -6229,8 +6077,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -6239,8 +6086,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512BW-NEXT: movb $-32, %dil @@ -6251,14 +6097,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -6296,8 +6140,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), 
%zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -6306,8 +6149,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 @@ -6318,16 +6160,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -6339,15 +6179,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512BW-FCP-NEXT: movw $31, %di ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -6359,13 +6197,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -6374,8 +6210,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -6384,8 +6219,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512BW-FCP-NEXT: movb $-32, %dil @@ -6396,14 +6230,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -6441,8 +6273,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm13 -; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -6451,8 +6282,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800 @@ -6463,16 +6293,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -6484,15 +6312,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-BW-NEXT: movw $31, %di ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -6504,13 +6330,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 -; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -6519,8 +6343,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -6529,8 +6352,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512DQ-BW-NEXT: movb $-32, %dil @@ -6541,14 +6363,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -6586,8 +6406,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm14 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] @@ -6596,8 +6415,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 @@ -6608,16 +6426,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} @@ -6629,15 +6445,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movw $31, %di ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} @@ -6649,13 +6463,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, 
%zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm18 {%k1} @@ -6664,8 +6476,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] @@ -6674,8 +6485,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm24 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil @@ -6686,14 +6496,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 @@ -11369,125 +11177,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; 
AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; 
AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: 
vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 @@ -11496,7 +11292,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -11508,22 +11304,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 @@ -11719,125 +11515,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; 
AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 @@ -11846,7 +11630,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -11858,22 +11642,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 @@ -12069,125 +11853,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-NEXT: 
vpermt2d %zmm1, %zmm31, %zmm2 @@ -12196,7 +11968,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -12208,22 +11980,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 @@ -12419,125 +12191,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), 
%zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 @@ -12546,7 +12306,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -12558,22 +12318,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 @@ -12769,125 +12529,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 
640(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, 
%zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 @@ -12896,7 +12644,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -12908,22 +12656,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 @@ -13119,125 +12867,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 @@ -13246,7 +12982,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -13258,22 +12994,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 @@ -13469,125 +13205,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 
1024(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d 
%zmm1, %zmm21, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 @@ -13596,7 +13320,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -13608,22 +13332,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 @@ -13819,125 +13543,113 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,20,26,0,6,12,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,21,27,1,7,13,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm6 = [0,0,0,0,0,0,0,0,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,0,1,7,13,19,25,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,2,8,14,20,26,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,9,15,21,27,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 @@ -13946,7 +13658,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 @@ -13958,22 +13670,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 7948141f6becd..82587744f52a1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -231,7 +231,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,4,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] @@ -295,7 +295,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,4,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] @@ -359,7 +359,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,4,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] @@ -423,7 +423,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: 
vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,4,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] @@ -3221,115 +3221,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 
-; AVX512-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-NEXT: kmovw %edi, %k2 -; AVX512-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; 
AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512-NEXT: vpermi2d %zmm3, %zmm12, %zmm13 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = 
[26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -3338,115 +3326,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-FCP-NEXT: kmovw %edi, %k2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, 
%zmm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; 
AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -3455,115 +3431,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-NEXT: kmovw %edi, %k2 -; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512DQ-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm15, 
%zmm14 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -3572,115 +3536,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: 
vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, 
%zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -3689,115 +3641,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd 
{{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = 
[0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm12, %zmm13 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3806,115 +3746,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: 
vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, 
%zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -3923,115 +3851,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 
-; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: 
vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = 
[0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -4040,115 +3956,103 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = 
[26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm13, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -6929,8 +6833,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -6943,8 +6846,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k2 @@ -6965,122 +6867,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512-NEXT: vmovdqa32 
%zmm18, %zmm16 {%k2} ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512-NEXT: vpermt2d %zmm12, %zmm20, %zmm21 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512-NEXT: vmovdqa64 %zmm15, 
%zmm23 +; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; AVX512-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; AVX512-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; 
AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm15, %zmm24, %zmm9 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -7090,14 +6981,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%r10) -; AVX512-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, (%r9) +; 
AVX512-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-NEXT: vzeroupper @@ -7121,8 +7012,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -7135,8 +7025,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 @@ -7157,122 +7046,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm16 {%k2} ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm20, 
%zmm21 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; 
AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512-FCP-NEXT: 
vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm24, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -7282,14 +7160,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%r10) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-FCP-NEXT: vzeroupper @@ -7313,8 +7191,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -7327,8 +7204,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 @@ -7349,122 +7225,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512DQ-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm16 {%k2} ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm20, %zmm21 +; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; 
AVX512DQ-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; 
AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm24, %zmm9 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -7474,14 +7339,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%r10) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-NEXT: vzeroupper @@ -7505,8 +7370,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -7519,8 +7383,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 @@ -7541,122 +7404,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512DQ-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; 
AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm24, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -7666,14 +7518,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r10) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -7697,8 +7549,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -7711,8 +7562,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -7733,122 +7583,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm20 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm21 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm28 +; 
AVX512BW-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, 
%zmm28, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm24, %zmm9 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -7858,14 +7697,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-NEXT: vzeroupper @@ 
-7889,8 +7728,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -7903,8 +7741,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 @@ -7925,122 +7762,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; 
AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm24, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -8050,14 +7876,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm26, 64(%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -8081,8 +7907,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -8095,8 +7920,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 @@ -8117,122 +7941,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, 
%zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; 
AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm30 ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm24, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -8242,14 +8055,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 
(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -8273,8 +8086,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] @@ -8287,8 +8099,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 @@ -8309,122 +8120,111 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm20 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm18 ; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm23, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm23, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm24, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm25, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm26, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm27, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm22, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm26, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm27, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm6, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm26, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm29 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm28, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm27, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, 
%zmm24, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm15, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm21, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm24, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 @@ -8434,14 +8234,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -14172,302 +13972,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i32_stride7_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-NEXT: 
vmovdqa64 1664(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqa64 
832(%rdi), %zmm0 +; AVX512-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512-NEXT: vpermt2d %zmm14, %zmm2, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, 
%zmm0, %zmm1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2d %zmm23, 
%zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512-NEXT: 
vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: 
vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512-NEXT: vpermt2d %zmm9, %zmm21, 
%zmm5 ; AVX512-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14477,14 +14277,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512-NEXT: movw $480, %ax # imm = 0x1E0 @@ -14496,15 +14296,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm31, %zmm3 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14512,17 +14311,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 
%zmm0, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14531,88 +14331,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; 
AVX512-NEXT: vmovdqa64 %zmm6, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -14620,302 +14424,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i32_stride7_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 
256(%rdi), %zmm5 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 
%zmm19 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = 
[0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm10, 
%zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; 
AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512-FCP-NEXT: 
vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 
; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm5 ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14925,14 +14729,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -14944,15 +14748,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm31, %zmm3 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14960,17 +14763,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14979,88 +14783,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: 
vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 
128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -15068,302 +14876,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-LABEL: load_i32_stride7_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17 +; 
AVX512DQ-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm2, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 
-; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d 
%zmm7, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; 
AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, 
%zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 -; 
AVX512DQ-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm21, %zmm5 ; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15373,14 +15181,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-NEXT: movw $480, %ax # imm = 0x1E0 @@ -15392,15 +15200,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm31, %zmm3 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15408,17 +15215,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15427,88 +15235,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, 
%zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 
-; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 64(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%r8) -; 
AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512DQ-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -15516,302 +15328,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i32_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512DQ-FCP-NEXT: 
vmovdqa64 512(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm5 ; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15821,14 +15633,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -15840,15 +15652,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm31, %zmm3 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15856,17 +15667,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15875,88 +15687,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 
%zmm0, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 
(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512DQ-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -15964,302 +15780,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = 
[0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; 
AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512BW-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa64 
%zmm18, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; 
AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm5
; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16269,14 +16085,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1}
+; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
-; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0
@@ -16288,15 +16104,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2}
+; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1}
+; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2}
+; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm3 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16304,17 +16119,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2}
+; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1}
+; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1}
+; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2}
+; 
AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16323,88 +16139,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 
# 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 
%zmm22, 64(%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rcx)
+; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r8)
+; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT: vmovaps %zmm2, 64(%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9)
-; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9)
+; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r9)
+; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9)
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rax)
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax)
; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -16412,302 +16232,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-FCP-LABEL: load_i32_stride7_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48
-; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17
-; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11
-; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7
-; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12
-; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
-; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
-; 
AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; 
AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, 
%zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; 
AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-FCP-NEXT: # zmm28 = 
mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, 
%zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm5 ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16717,14 +16537,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -16736,15 +16556,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm31, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16752,17 +16571,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16771,88 +16591,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 
# 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -16860,302 +16684,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-LABEL: load_i32_stride7_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d 
%zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = 
[25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, 
%zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; 
AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = 
[0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d 
%zmm6, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, 
%zmm2, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: 
vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm5 ; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17165,14 +16989,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: movw $480, %ax # imm = 0x1E0 @@ -17184,15 +17008,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm31, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17200,17 +17023,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17219,88 +17043,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512DQ-BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -17308,302 +17136,302 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm8, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm2, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,0,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm25, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm20, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm30, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm6, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,10,11,12,13,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,19,26] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,21,28] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,13,22,29] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,9,10,11,12,17,24,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm2, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,19,26,1,8,15,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm31, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,18,25,0,7,14,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm31, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm16, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm31, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm29, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm2, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm6, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = 
[4,11,18,25] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm21 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm21, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17613,14 +17441,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -17632,15 +17460,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa32 %zmm31, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17648,17 +17475,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17667,88 +17495,92 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm21, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm27, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm28 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm30 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm31, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm31 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm27, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 13410fb5cc4b8..6470c69a17a83 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -3393,115 +3393,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512-NEXT: movb $-64, %dil ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = 
[1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,9,17,25] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} 
xmm13 = [3,11,19,27] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,4,12,20,28] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm15 = [5,13,21,29] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -3512,115 +3523,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-FCP-NEXT: # 
zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512-FCP-NEXT: movb $-64, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,9,17,25] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; 
AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,19,27] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,4,12,20,28] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [5,13,21,29] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r9) ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -3631,115 +3653,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512DQ-NEXT: movb $-64, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm11 = 
[1,9,17,25] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,19,27] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; 
AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,4,12,20,28] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm15 = [5,13,21,29] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} 
xmm8 = [6,14,22,30] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -3750,115 +3783,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: movb $-64, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, 
%zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,9,17,25] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,19,27] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,4,12,20,28] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = 
[5,13,21,29] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -3869,115 +3913,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,9,17,25] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # zmm10 
= mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,19,27] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = 
[0,0,0,0,4,12,20,28] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm15 = [5,13,21,29] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; 
AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3988,115 +4043,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: movb $-64, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 
{{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,9,17,25] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,19,27] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,4,12,20,28] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [5,13,21,29] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] 
; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -4107,115 +4173,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 
192(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: movb $-64, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,9,17,25] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; 
AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,19,27] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm15 = [5,13,21,29] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-BW-NEXT: vpermi2d 
%zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -4226,115 +4303,126 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, 
%zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -7375,1753 +7463,2217 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i32_stride8_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512-NEXT: movb $-64, %dil -; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512-NEXT: 
vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; 
AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3 
+; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512-NEXT: vpermi2d %zmm3, 
%zmm15, %zmm18 +; AVX512-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512-NEXT: movb $-64, %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,9,17,25] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512-NEXT: vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm3, 
%zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; AVX512-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; 
AVX512-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-NEXT: vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride8_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512-FCP-NEXT: movb $-64, %dil -; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; 
AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: 
vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; 
AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm18 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512-FCP-NEXT: movb $-64, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = 
[0,0,0,0,1,9,17,25] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FCP-NEXT: vzeroupper -; AVX512-FCP-NEXT: retq -; -; AVX512DQ-LABEL: load_i32_stride8_vf32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vmovdqa64 
(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512DQ-NEXT: movb $-64, %dil -; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; 
AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-FCP-NEXT: addq $1288, %rsp # imm = 0x508 +; AVX512-FCP-NEXT: vzeroupper +; AVX512-FCP-NEXT: retq +; +; AVX512DQ-LABEL: load_i32_stride8_vf32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm16, 
%zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: 
vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm24 +; 
AVX512DQ-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm18 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512DQ-NEXT: movb $-64, %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,9,17,25] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512DQ-NEXT: 
vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512DQ-NEXT: 
vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-FCP-NEXT: # zmm13 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512DQ-FCP-NEXT: movb $-64, %dil -; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FCP-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 
{{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: movb $-64, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,9,17,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; 
AVX512DQ-FCP-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FCP-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # zmm13 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512BW-NEXT: movb $-64, %dil -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; 
AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, 
%zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = 
[4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 
+; AVX512BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,9,17,25] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, 
%zmm18, %zmm4 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; 
AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride8_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512BW-FCP-NEXT: movb $-64, %dil -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-FCP-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = 
[0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; 
AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = 
[6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: movb $-64, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,9,17,25] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, 
%zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: 
vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%rdx) +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-FCP-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride8_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512DQ-BW-NEXT: movb $-64, %dil -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm23 +; 
AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512DQ-BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-NEXT: # zmm29 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512DQ-BW-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512DQ-BW-NEXT: movb $-64, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d 
%zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-BW-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 -; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = 
[0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm2 +; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] 
+; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm17, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm31, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm15, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm15 +; AVX512DQ-BW-FCP-NEXT: movb $-64, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2d 
%zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm5 
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm16, %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm26, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, 
%zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm9 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm10 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm28, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-BW-FCP-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 @@ -15664,4745 +16216,4633 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i32_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9 -; 
AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512-NEXT: movb $-64, %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2d %zmm13, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-NEXT: vpermt2d %zmm22, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 
%zmm0, %zmm7 +; AVX512-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: 
vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; 
AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, 
%zmm11, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; AVX512-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm17 +; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512-NEXT: movb $-64, %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512-NEXT: vmovdqa64 576(%rdi), 
%zmm21 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-NEXT: 
vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512-NEXT: 
vmovdqa64 %zmm24, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: 
vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-NEXT: vpermt2d %zmm29, %zmm2, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512-NEXT: vpermi2d %zmm30, %zmm13, %zmm7 +; AVX512-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 
= ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, (%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 192(%rdx) -; 
AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, (%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, (%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 128(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, (%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm11, (%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, (%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, (%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, (%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 
64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, (%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride8_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: 
vmovdqa64 768(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512-FCP-NEXT: movb $-64, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-FCP-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512-FCP-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm5, 
%zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm13, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, 
%zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm17 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512-FCP-NEXT: movb $-64, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; 
AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; 
AVX512-FCP-NEXT: vmovaps %zmm6, (%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, (%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, (%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, (%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm11, (%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, (%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, (%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 
64(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, (%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512-FCP-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 
1280(%rdi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512DQ-NEXT: movb $-64, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: 
vpermt2d %zmm13, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-NEXT: # zmm5 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-NEXT: 
vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 
%zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm11, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm17 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512DQ-NEXT: movb $-64, %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQ-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: 
vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 
%zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 
%zmm15, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 
%zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm2, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermi2d 
%zmm30, %zmm13, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, (%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, (%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, (%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, (%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 
128(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, (%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm11, (%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, (%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, (%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, (%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-NEXT: 
vmovdqa64 %zmm1, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: movb $-64, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: movb $-64, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d 
%zmm19, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, 
%zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm10 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; 
AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte 
Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm11, (%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, (%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), 
%zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: 
vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; 
AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 
1600(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, 
%zmm10 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 
%zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 
%zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm19, 
%zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2d 
%zmm29, %zmm2, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte 
Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 192(%rdx) -; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm11, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
+; AVX512BW-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride8_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: movb $-64, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, 
%zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-FCP-NEXT: 
vmovdqa64 64(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; 
AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte 
Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; 
AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: movb $-64, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: 
vmovdqa64 1728(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, 
%zmm0 +; AVX512BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-FCP-NEXT: 
vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm10 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, 
%zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps 
%zmm6, (%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm11, (%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, (%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, (%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-FCP-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride8_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: movb $-64, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; 
AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d 
%zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 
$0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, 
%zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: movb $-64, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 
= ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; 
AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte 
Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2d %zmm30, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm10 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm21, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm11, (%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 
64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, (%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, (%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-BW-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: 
vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: movb $-64, %al -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm17, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: movb 
$-64, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,9,17,25] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm30, %zmm13, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm25, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm10 = 
ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm23, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <512 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll index 0c7c3f4b16646..7647d150b8b57 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -805,29 +805,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = 
[0,0,0,0,0,4,8,12] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -841,29 +838,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,8,12] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; 
AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -877,29 +871,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,8,12] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -913,29 +904,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -949,29 +937,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = 
zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,8,12] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -985,29 +970,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,8,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = 
zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1021,29 +1003,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1057,29 +1036,26 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm0 = zmm2[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) @@ -1752,497 +1728,473 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride4_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = 
[0,0,0,0,0,4,8,12] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride4_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 
%zmm7, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512-FCP-NEXT: 
vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride4_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 
{{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,8,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-NEXT: 
vmovdqa64 %zmm4, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride4_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride4_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; 
AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,8,12] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; 
AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride4_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride4_vf16: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 
= zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride4_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm13[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm14[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <64 x i64>, ptr %in.vec, align 64 @@ -3680,913 +3632,889 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride4_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; 
AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 
+; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 
{{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride4_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 
960(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; 
AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = 
[3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride4_vf32: 
; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512DQ-NEXT: vmovdqa64 
%zmm5, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 
= zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%r8) +; 
AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride4_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, 
%zmm21, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, 
%zmm22 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride4_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm21 = [1,5,9,13,1,5,9,13] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-NEXT: 
vmovdqa64 %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q 
%zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride4_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, 
%zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = 
[2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride4_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm8 -; 
AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq 
{{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 
{{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride4_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
768(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,4,8,12] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm18[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm21[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm23[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm25[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm26 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm27[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm22, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm29[0,1,2,3],zmm28[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; 
AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -7556,2369 +7484,2337 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride4_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 
%zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, 
%zmm14 +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512-NEXT: vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; 
AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm9 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 
-; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; 
AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 64-byte Folded Reload +; AVX512-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; 
AVX512-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; 
AVX512-NEXT: vmovdqa64 %zmm5, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride4_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; 
AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512-FCP-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; 
AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm24, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 
1664(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = 
zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-FCP-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 
%zmm15, 320(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 384(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512-FCP-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride4_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm24, 
%zmm9, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512DQ-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm28 +; 
AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-NEXT: 
vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: 
vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload 
-; AVX512DQ-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512DQ-NEXT: 
vmovdqa64 %zmm6, 384(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] 
+; AVX512DQ-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride4_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, 
%zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: 
vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 
$228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 320(%r8) -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-FCP-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm5, 384(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-FCP-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride4_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqu64 
%zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, 
%zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q 
%zmm24, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; 
AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 64-byte Folded Reload 
+; AVX512BW-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; 
AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: 
vmovdqa64 %zmm6, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride4_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512BW-FCP-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, 
%zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; 
AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm0 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = 
zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm27 = 
zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-FCP-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] 
+; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-FCP-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride4_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm30 -; 
AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 
-; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512DQ-BW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: 
vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, 
%zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm0 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm23, %zmm23 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-BW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride4_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, 
%zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm30, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, 
%zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm0, %zmm31, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm23 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # 
zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm15 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm24 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm29 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte 
Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm17 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-BW-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm31 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm26 = zmm26[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm16 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm25 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 448(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 320(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 448(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 320(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 384(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 320(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index 07988a416bac4..ef7bf00d7f33d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -1085,464 +1085,432 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i64_stride5_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-NEXT: 
vmovdqa64 128(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,5,10,15] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride5_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[5,10,15,0,5,10,15,0] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,5,10,15] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride5_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,5,10,15] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512DQ-NEXT: 
vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride5_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; 
AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,5,10,15] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermi2q 
%zmm0, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride5_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} 
ymm6 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride5_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, 
%zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,12,1,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] +; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,11,0,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [12,1,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <40 x i64>, ptr %in.vec, align 64 @@ -2432,90 +2400,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm8 ; 
AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; 
AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512-NEXT: vmovdqa64 
%zmm19, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: vzeroupper @@ -2529,90 +2492,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 
= [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -2626,90 +2584,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512DQ-NEXT: 
vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbq 
{{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: vzeroupper @@ -2723,90 +2676,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, 
%zmm12 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, 
%zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -2820,90 +2768,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-NEXT: 
vmovdqa64 (%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; 
AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-NEXT: 
vmovdqa64 %zmm10, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-NEXT: vzeroupper @@ -2917,90 +2860,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; 
AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] 
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper @@ -3014,90 +2952,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, 
%zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper @@ -3111,90 +3044,85 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,12,1,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,11,0,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [12,1,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -5149,1609 +5077,1545 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride5_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; 
AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 -; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; AVX512-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512-NEXT: vpermt2q %zmm25, 
%zmm15, %zmm4 +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 
= [0,1,2,3,4,5,9,14] -; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512-NEXT: vmovdqa64 
%zmm16, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride5_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512-FCP-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm18, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm20, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 
# 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermt2q 
%zmm3, %zmm7, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; 
AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512-FCP-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride5_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQ-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 -; 
AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQ-NEXT: 
vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; 
AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 
%zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 
128(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512DQ-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512DQ-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride5_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQ-FCP-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = 
[0,1,2,3,4,5,6,12] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 
64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512DQ-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512DQ-FCP-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; 
AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), 
%zmm26 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, 
%zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride5_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $648, %rsp # imm = 0x288 +; 
AVX512BW-FCP-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, 
%zmm12, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; 
AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; 
AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm30, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512BW-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512BW-FCP-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride5_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQ-BW-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, 
%zmm26, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm26 -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512DQ-BW-NEXT: 
vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, 
%zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = 
[0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512DQ-BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512DQ-BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQ-BW-FCP-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 128(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,12,1,6,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,11,0,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [12,1,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; 
AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, 
%zmm7, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm11, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <160 x i64>, ptr %in.vec, align 64 @@ -10911,291 +10775,287 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride5_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; 
AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512-NEXT: vpmovsxbq 
{{.*#+}} ymm3 = [0,5,10,15] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; 
AVX512-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; 
AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; 
AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -11203,467 +11063,462 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} 
+; AVX512-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 384(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, (%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, (%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512-NEXT: 
vmovdqa64 %zmm23, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm2, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 
+; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512-FCP-NEXT: 
vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; 
AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; 
AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -11671,467 +11526,462 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: 
movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} 
+; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512-FCP-NEXT: vmovups (%rsp), 
%zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 
# 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, (%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, (%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride5_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 -; 
AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512DQ-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 
%zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512DQ-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm2 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: 
vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512DQ-NEXT: 
vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -12139,467 +11989,462 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: 
vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps 
%zmm2, 384(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 448(%rdx) +; 
AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512DQ-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 2112(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = 
[1,6,11,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, 
%zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm25, 
%zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 
{%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -12607,467 +12452,462 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, 
%zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512DQ-FCP-NEXT: 
vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512DQ-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # 
zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm3, 
%zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; 
AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; 
AVX512BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, 
%zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; 
AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; 
AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -13075,467 +12915,462 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: 
vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; 
AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: 
vmovaps %zmm1, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride5_vf64: -; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512BW-FCP: # %bb.0: +; AVX512BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: 
vmovdqa64 1408(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, 
%zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 
-; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -13543,467 +13378,462 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb 
$56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 384(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 
# 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512BW-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride5_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 
1152(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 
%zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, 
%zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm19 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 
2304(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -14011,467 +13841,462 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 384(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512DQ-BW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-BW-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512DQ-BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,12,1,6,0] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,6,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,2,7,12,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,11,0,5,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [2,7,12,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,0,5,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm25 = [12,1,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm8, 
%zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, 
%zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -14479,177 +14304,176 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, 
%zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = 
[0,1,2,3,4,5,9,14] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 448(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 384(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 320(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 256(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 448(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 256(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 320(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 384(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 
128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 320(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <320 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 7d3209397c3df..86381da9d2329 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -1357,84 +1357,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i64_stride6_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512-NEXT: movb $-64, %dil ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: 
vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512-NEXT: movb $24, %dil ; AVX512-NEXT: kmovw %edi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,6,12,0,0,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512-NEXT: 
vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1442,84 +1431,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i64_stride6_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512-FCP-NEXT: movb $-64, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = 
[1,7,13,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512-FCP-NEXT: movb $24, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,6,12,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512-FCP-NEXT: vpermi2q %zmm1, 
%zmm0, %zmm12 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1527,84 +1505,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i64_stride6_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512DQ-NEXT: movb $-64, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq 
{{.*#+}} ymm13 = [1,7,13,0] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-NEXT: movb $24, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,6,12,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} 
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1612,84 +1579,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i64_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: movb $-64, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: movb $24, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm11 = [0,0,0,6,12,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1697,84 +1653,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 
%zmm7, %zmm2 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: movb $24, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,6,12,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, 
%zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1782,84 +1727,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i64_stride6_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: movb $-64, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: movb $24, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,6,12,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1867,84 +1801,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-LABEL: load_i64_stride6_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: 
vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: movb $-64, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: movb $24, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,6,12,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1952,84 +1875,73 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i64_stride6_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,10,0,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,11,1,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,4,10,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $24, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, 
%zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,11,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,6,12,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,7,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -3134,20 +3046,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i64_stride6_vf16: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512-NEXT: # zmm14 = 
mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -3156,120 +3067,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512-NEXT: movb $-64, %dil ; AVX512-NEXT: kmovw %edi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512-NEXT: movb $24, %dil +; AVX512-NEXT: kmovw %edi, %k2 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512-NEXT: movb $-32, %dil 
+; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,11,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512-NEXT: movb $24, %dil -; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: movb $-32, %dil -; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512-NEXT: # zmm22 = 
mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -3277,20 +3177,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i64_stride6_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512-FCP-NEXT: # zmm14 = 
mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -3299,120 +3198,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512-FCP-NEXT: movb $-64, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512-FCP-NEXT: movb $24, 
%dil +; AVX512-FCP-NEXT: kmovw %edi, %k2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512-FCP-NEXT: movb $-32, %dil +; AVX512-FCP-NEXT: kmovw %edi, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,11,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512-FCP-NEXT: movb $24, %dil -; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512-FCP-NEXT: movb $-32, %dil -; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm24, 
%zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -3420,20 +3308,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-LABEL: load_i64_stride6_vf16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), 
%zmm1 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -3442,120 +3329,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512DQ-NEXT: movb $-64, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 
%zmm18, %zmm15 {%k2} +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512DQ-NEXT: movb $24, %dil +; AVX512DQ-NEXT: kmovw %edi, %k2 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512DQ-NEXT: movb $-32, %dil +; AVX512DQ-NEXT: kmovw %edi, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,11,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-NEXT: movb $24, %dil -; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-NEXT: movb $-32, %dil -; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; 
AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ 
-3563,20 +3439,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i64_stride6_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -3585,120 +3460,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: movb $-64, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512DQ-FCP-NEXT: movb $24, %dil +; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: movb $-32, %dil +; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,11,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: movb $24, %dil -; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: movb $-32, %dil -; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 
%zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 
{%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -3706,20 +3570,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -3728,120 +3591,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512BW-NEXT: vpermi2q 
%zmm3, %zmm4, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512BW-NEXT: movb $24, %dil +; AVX512BW-NEXT: kmovd %edi, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512BW-NEXT: movb $-32, %dil +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,11,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-NEXT: movb $24, %dil -; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512BW-NEXT: movb $-32, %dil -; 
AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3849,20 +3701,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-LABEL: load_i64_stride6_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -3871,120 +3722,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 
%zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: movb $-64, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512BW-FCP-NEXT: movb $24, %dil +; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512BW-FCP-NEXT: movb $-32, %dil +; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm20 = [0,0,0,5,11,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: movb $24, %dil -; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: movb $-32, %dil -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = 
[0,11,1,7,0,11,1,7] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -3992,20 +3832,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-LABEL: load_i64_stride6_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm5 ; 
AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -4014,120 +3853,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: movb $-64, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512DQ-BW-NEXT: movb $24, %dil +; AVX512DQ-BW-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512DQ-BW-NEXT: movb $-32, %dil +; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,11,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: movb $24, %dil -; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: movb $-32, %dil -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm19, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -4135,20 +3963,19 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-LABEL: load_i64_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,10,0,6,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] @@ -4157,120 +3984,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,4,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,11,1,7,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,4,10,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm17 +; AVX512DQ-BW-FCP-NEXT: movb $24, %dil +; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil +; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,11,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [11,1,7,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: movb $24, %dil -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 -; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,10,0,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,6,12,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm24 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm23, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm24, %zmm22, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm20, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -6750,351 +6566,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i64_stride6_vf32: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; 
AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-NEXT: vpermt2q 
%zmm3, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq 
{{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; 
AVX512-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 
%zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm18, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; 
AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 
{%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-NEXT: movb $-32, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: 
vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: 
vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm31, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -7102,351 +6899,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i64_stride6_vf32: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27 +; 
AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 
%zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 
%zmm23, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-FCP-NEXT: movb $-32, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 
64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm31, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512-FCP-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -7454,351 +7232,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-LABEL: load_i64_stride6_vf32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; 
AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] 
-; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-NEXT: 
vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q 
%zmm31, %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-NEXT: movb $-32, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 
64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm31, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -7806,351 +7565,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i64_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, 
%zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, 
%zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 
896(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 
1152(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-FCP-NEXT: movb $24, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: movb $-32, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload 
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm31, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-FCP-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -8158,351 +7898,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 
-; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512BW-NEXT: vmovdqa64 %zmm20, 
%zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q 
%zmm26, %zmm3, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 -; AVX512BW-NEXT: movb $56, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: movb $24, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512BW-NEXT: vpmovsxbq 
{{.*#+}} ymm7 = [11,1,7,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 +; AVX512BW-NEXT: movb $56, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -8510,351 +8231,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-LABEL: load_i64_stride6_vf32: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: movb $-32, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, 
%zmm31, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -8862,351 +8564,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-LABEL: load_i64_stride6_vf32: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), 
%zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 
%zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 
320(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q 
%zmm18, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte 
Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: movb $-32, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded 
Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm31, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 
$0, %xmm1, %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -9214,351 +8897,332 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-LABEL: load_i64_stride6_vf32: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), 
%zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,10,0,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,4,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm21, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,11,1,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 
{{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,10,0,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, 
%zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,4,10,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,5,11,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,6,12,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm17 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [5,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 
{%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $-32, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm25, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm29, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm26, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm31, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 
64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -14855,755 +14519,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i64_stride6_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; 
AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; 
AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,11,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm29 -; 
AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512-NEXT: 
# zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512-NEXT: vpermi2q 
%zmm2, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm26 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512-NEXT: 
vmovdqa64 %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512-NEXT: movb $-32, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 
%zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 
64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 
%zmm0, %zmm22 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 
16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, (%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, (%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, (%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, (%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, (%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -15611,755 +15256,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i64_stride6_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $7240, %rsp # imm = 
0x1C48 -; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 
%zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, 
%zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; 
AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,11,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, 
%zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, 
%zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512-FCP-NEXT: movb $56, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: movb $-64, %al -; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, 
%zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512-FCP-NEXT: 
vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 +; AVX512-FCP-NEXT: movb $56, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512-FCP-NEXT: movb $-64, %al +; AVX512-FCP-NEXT: kmovw %eax, %k2 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512-FCP-NEXT: movb $-32, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 
%zmm0, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: 
vmovaps %zmm8, 64(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, (%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 
64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -16367,755 +15993,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-LABEL: load_i64_stride6_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm26, 
%zmm10, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512DQ-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,11,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm2 -; 
AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 
%zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; 
AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512DQ-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, 
%zmm26 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; 
AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; 
AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512DQ-NEXT: movb $-32, %al ; 
AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 
%zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-NEXT: 
vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, (%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, (%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, (%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 
128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rdx) +; 
AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, (%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512DQ-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -17123,755 +16730,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i64_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = 
[0,0,0,5,11,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), 
%zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 
2880(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-FCP-NEXT: movb $24, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512DQ-FCP-NEXT: movb $-32, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps 
%zmm8, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 
256(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512DQ-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -17879,755 +17467,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512BW-NEXT: vpermt2q 
%zmm6, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, 
%zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), 
%zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, 
%zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,11,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 
128(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; 
AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 
%zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512BW-NEXT: 
vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: 
vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 
{%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: 
vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: 
vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 
(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-NEXT: 
vmovdqa64 %zmm1, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512BW-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -18635,755 +18204,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-LABEL: load_i64_stride6_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, 
%zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 
1664(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; 
AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,11,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, 
%zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 
%zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-FCP-NEXT: movb $-32, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 
# 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload 
-; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 
+; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 
128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512BW-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -19391,755 +18941,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-LABEL: load_i64_stride6_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-NEXT: 
vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,11,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q 
%zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 
%zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512DQ-BW-NEXT: movb $-32, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: 
vmovaps %zmm8, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, (%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: 
vmovaps %zmm2, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512DQ-BW-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -20147,755 +19678,736 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-LABEL: load_i64_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), 
%zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,5,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,6,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, 
%zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,10,0,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,11,1,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,10,0,6,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,11,1,7,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,4,10,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = 
[0,0,0,5,11,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,6,12,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,7,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, 
%zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: 
vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, 
%zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm25 = [5,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: 
vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm13 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $-32, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 16-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 384(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 384(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm30, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm30, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm30, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm30, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm30, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm30, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm30, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 256(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 384(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 448(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 320(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 320(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index cc3e5f3d1d82e..ea65208f175e0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -1618,873 +1618,761 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i64_stride7_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: movb $24, %r11b +; AVX512-NEXT: kmovw %r11d, %k2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512-NEXT: movb $-32, %r11b +; AVX512-NEXT: kmovw %r11d, %k1 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,5,6,12] +; AVX512-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] -; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = 
ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] -; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: movb $24, %r10b -; AVX512-NEXT: kmovw %r10d, %k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512-NEXT: movb $-32, %r10b -; AVX512-NEXT: kmovw %r10d, %k1 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, 
%zmm13, %zmm4 -; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rdi) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride7_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: movb $24, %r11b +; AVX512-FCP-NEXT: kmovw %r11d, %k2 +; 
AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512-FCP-NEXT: movb $-32, %r11b +; AVX512-FCP-NEXT: kmovw %r11d, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,5,6,12] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] -; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] -; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; 
AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-FCP-NEXT: movb $24, %r10b -; AVX512-FCP-NEXT: kmovw %r10d, %k2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512-FCP-NEXT: movb $-32, %r10b -; AVX512-FCP-NEXT: kmovw %r10d, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512-FCP-NEXT: 
vinserti32x4 $0, %xmm11, %zmm15, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride7_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; 
AVX512DQ-NEXT: movb $24, %r11b +; AVX512DQ-NEXT: kmovw %r11d, %k2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-NEXT: movb $-32, %r11b +; AVX512DQ-NEXT: kmovw %r11d, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,5,6,12] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] -; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] -; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; 
AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQ-NEXT: movb $24, %r10b -; AVX512DQ-NEXT: kmovw %r10d, %k2 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512DQ-NEXT: movb $-32, %r10b -; AVX512DQ-NEXT: kmovw %r10d, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 
{{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rdi) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: movb $24, %r11b +; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = 
zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: movb $-32, %r11b +; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,5,6,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, 
%zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQ-FCP-NEXT: movb $24, %r10b -; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: movb $-32, %r10b -; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 -; 
AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512BW-NEXT: 
vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: movb $24, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512BW-NEXT: movb $-32, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,5,6,12] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: 
vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride7_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; 
AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: movb $24, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: movb $-32, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = 
[0,0,0,0,0,5,6,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride7_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: movb $24, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: movb $-32, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] -; 
AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = 
[0,0,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,5,6,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movb $24, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,9] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,9,0,7,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,2,9,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm13, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <56 x i64>, ptr %in.vec, align 64 @@ -3705,183 +3593,167 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 ; AVX512-NEXT: movb $24, %r11b ; AVX512-NEXT: kmovw %r11d, %k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 ; AVX512-NEXT: movb $-32, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-NEXT: vpermt2q 
%zmm27, %zmm18, %zmm14 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm25[4,5,4,5] ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512-NEXT: vpermt2q %zmm29, %zmm24, %zmm17 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512-NEXT: vpermt2q %zmm28, %zmm18, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512-NEXT: vpermt2q %zmm27, %zmm21, %zmm22 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 +; AVX512-NEXT: vpermt2q %zmm28, %zmm20, %zmm18 +; AVX512-NEXT: vpermi2q %zmm25, %zmm3, %zmm21 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512-NEXT: vpermt2q %zmm29, %zmm25, %zmm19 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 
-; AVX512-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 +; AVX512-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm18 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512-NEXT: vpermt2q %zmm28, %zmm21, %zmm20 +; AVX512-NEXT: vpermi2q %zmm25, %zmm3, %zmm22 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 -; AVX512-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm20, 
%zmm20 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm4 +; AVX512-NEXT: vpermi2q %zmm3, %zmm25, %zmm23 +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-NEXT: vpermt2q %zmm15, %zmm4, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 -; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm18 -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm22 +; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm4 +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm23 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,9,0,7,0] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm7 = [2,9,2,9,2,9,2,9] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,4,5,6,14] +; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 +; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,9,0,0] +; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] -; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 +; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 -; AVX512-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512-NEXT: vpermt2q %zmm28, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm5 +; AVX512-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512-NEXT: vmovdqa64 
%zmm21, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm3, 64(%r10) ; AVX512-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) @@ -3892,183 +3764,167 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 ; AVX512-FCP-NEXT: movb $24, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 ; AVX512-FCP-NEXT: movb $-32, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm25[4,5,4,5] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = 
[13,6,13,6,13,6,13,6] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm3, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm19 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, 
%zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm22, %zmm18 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm3, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; 
AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm20, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm25, %zmm23 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm18 -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm22 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,9,0,7,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = 
ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,4,5,6,14] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rdx) -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) @@ -4079,183 +3935,167 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 ; AVX512DQ-NEXT: movb $24, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k2 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 ; AVX512DQ-NEXT: movb $-32, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512DQ-NEXT: 
vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm25[4,5,4,5] ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm24, %zmm17 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm18, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm21, %zmm22 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm20, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm3, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm25, %zmm19 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; 
AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm22, %zmm18 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm21, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm3, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm20 +; 
AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm20, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm22, %zmm4 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm25, %zmm23 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm4, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm18 -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm22 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm4 +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm23 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,9,0,7,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; 
AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,4,5,6,14] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 
(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) @@ -4266,183 +4106,167 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 ; AVX512DQ-FCP-NEXT: movb $24, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm14, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: movb $-32, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm25[4,5,4,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm3, %zmm21 +; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm22, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm3, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm20, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm25, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm23, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm3, %zmm4, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,9,0,7,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, 
%zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) @@ -4453,184 +4277,168 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; 
AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm31 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm7, %zmm29 ; AVX512BW-NEXT: movb $24, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm28[4,5,4,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 ; AVX512BW-NEXT: movb $-32, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm14 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm24, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm22 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; 
AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm20, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm3, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm19 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm27 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm27, %xmm27 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm26, %zmm25 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm5 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = 
[4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm18 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm18 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512BW-NEXT: vmovdqa64 
%zmm28, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm25, %xmm25 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm20, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm22, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm26, %zmm23 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm21 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm23, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm22 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm22 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm23, %zmm4 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm23 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,9,0,7,0] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,4,5,6,14] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,2,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = 
[0,0,4,11] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -4640,184 +4448,168 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 
192(%rdi), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm31 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm29 ; AVX512BW-FCP-NEXT: movb $24, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm28[4,5,4,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: movb $-32, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm4, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm27 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm26, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24 ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm18 -; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, 
%zmm22, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm25, %xmm25 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm20, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm23, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,9,0,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,4,5,6,14] +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,2,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm21, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -4827,184 +4619,168 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm31 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm7, %zmm29 ; AVX512DQ-BW-NEXT: movb $24, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm28[4,5,4,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: movb $-32, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm14 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm24, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm4, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm27 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm27, %xmm27 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm26, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm18 -; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = 
ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm25, %xmm25 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm25, %zmm20, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm26, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm21, %zmm23, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, 
%zmm4, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm22 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,4,5,6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm8 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,2,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, 
%zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -5014,184 +4790,168 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm29 ; AVX512DQ-BW-FCP-NEXT: movb $24, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = 
zmm9[4,5,4,5],zmm30[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm28[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,5,6,9] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm4, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm26, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 
128(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 
$0, %xmm4, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm25, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm20, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm23, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,4,5,6,14] 
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,2,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -7773,3817 +7533,3365 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2728, %rsp # imm = 0xAA8 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm25 +; AVX512-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm23, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512-NEXT: vpermt2q %zmm24, %zmm4, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-NEXT: vpermt2q %zmm8, %zmm4, %zmm7 +; AVX512-NEXT: vpermi2q %zmm22, %zmm0, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-NEXT: vpermt2q %zmm8, %zmm4, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-NEXT: vpermt2q %zmm24, %zmm4, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-NEXT: vpermt2q %zmm23, %zmm4, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-NEXT: vpermt2q %zmm8, %zmm9, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512-NEXT: vpermt2q %zmm24, %zmm9, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm16 +; AVX512-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,6,13,0] +; AVX512-NEXT: 
vmovdqa64 %zmm1, %zmm18 +; AVX512-NEXT: vpermt2q %zmm8, %zmm9, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512-NEXT: vpermt2q %zmm24, %zmm9, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm21 +; AVX512-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-NEXT: vpermt2q %zmm8, %zmm27, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-NEXT: vpermt2q %zmm23, %zmm27, %zmm9 +; AVX512-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,9,0,7,0] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,2,9,0,0] +; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm24 +; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm20 +; AVX512-NEXT: vpermi2q %zmm3, %zmm22, %zmm30 +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,9] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 
%zmm14, %zmm26 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,10] +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm31 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,11] +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,12] +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, 
%zmm20 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,13] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm19 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,14] +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm20 ; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm28 +; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,8,15] +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] -; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 -; AVX512-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 -; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512-NEXT: vpermt2q %zmm13, %zmm15, %zmm20 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm31 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm4[4,5,4,5],zmm0[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 912(%rdi), %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 -; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; 
AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm10 +; AVX512-NEXT: vpermi2q %zmm4, %zmm10, %zmm15 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm7[4,5,4,5],zmm0[4,5,4,5] +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512-NEXT: vpermt2q %zmm28, %zmm17, %zmm21 +; AVX512-NEXT: vpermi2q %zmm0, %zmm7, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 +; AVX512-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm24 +; AVX512-NEXT: vpermi2q %zmm7, %zmm0, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm28[4,5,4,5] +; AVX512-NEXT: vpermt2q %zmm28, %zmm14, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 912(%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm29, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm8, %zmm29 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm28 +; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm23 ; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11] +; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; 
AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512-NEXT: vpermt2q %zmm13, %zmm8, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vpermi2q %zmm4, %zmm10, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm6 = [5,12] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-NEXT: vpermt2q %zmm13, %zmm6, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512-NEXT: vpermi2q %zmm10, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqa 960(%rdi), %ymm12 +; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm19 ; AVX512-NEXT: vmovdqa 512(%rdi), %ymm12 ; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512-NEXT: vpermi2q %zmm4, %zmm10, %zmm6 +; AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $-32, %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: 
vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm14 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm15 -; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm22 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte 
Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm13 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm4, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm26, %zmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm17 +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5 {%k2} +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm18 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm14 ; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX512-NEXT: # ymm14 = 
mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX512-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11 +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm14, %zmm8, %zmm14 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512-NEXT: vinsertf64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm22, (%rsi) +; 
AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm12, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm4, (%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm21, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm5, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, (%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovaps %zmm7, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512-NEXT: vmovaps %zmm8, (%rax) -; AVX512-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512-NEXT: vmovaps %zmm14, 64(%rax) +; AVX512-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25 +; AVX512-FCP-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] -; 
AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm16 +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,6,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,9,0,7,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,2,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,9] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q 
%zmm20, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,10] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,12] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, 
%zmm2, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,14] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,8,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm31 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm4[4,5,4,5],zmm0[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 -; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, 
%zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm7[4,5,4,5],zmm0[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm28[4,5,4,5] +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm8, %zmm29 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm28 +; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm23 ; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [5,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm12, %xmm19 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm12 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $-32, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm13 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm26, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm17 +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = 
mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k2} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm18 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm14 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm14, %zmm8, %zmm14 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; 
AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm4, (%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 
64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovaps %zmm7, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512-FCP-NEXT: vmovaps %zmm8, (%rax) -; AVX512-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512-FCP-NEXT: vmovaps %zmm14, 64(%rax) +; AVX512-FCP-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $2728, %rsp # imm = 0xAA8 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm25 +; AVX512DQ-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] -; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm4, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm4, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm4, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm4, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm9, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm9, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm16 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,6,13,0] +; 
AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm9, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm9, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm27, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm27, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,9,0,7,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,2,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm22, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,9] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-NEXT: 
vmovdqa64 640(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,10] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] -; 
AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,12] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,14] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm20 ; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,8,15] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 -; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm15, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm31 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm4[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 -; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm10, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm7[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm17, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm18, %zmm24 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm0, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm28[4,5,4,5] +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm14, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm29, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm8, %zmm29 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm28 +; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm23 ; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm25 -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11] +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm8, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm10, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm6 = [5,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm6, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm12 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti32x4 $1, %ymm12, %xmm19 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm12 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm10, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $-32, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm22 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm13 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512DQ-NEXT: vinserti32x4 $0, 
%xmm19, %zmm24, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm4, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm14 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm15 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm26, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm17 +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm12 = 
mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 {%k2} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm18 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm8, %zmm12 +; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm14 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11 +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm14, %zmm8, %zmm14 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, 
%ymm8, %zmm6, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm4, (%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 
%zmm17, 128(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovaps %zmm7, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-NEXT: vmovaps %zmm8, (%rax) -; AVX512DQ-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-NEXT: vmovaps %zmm14, 64(%rax) +; AVX512DQ-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, 
%zmm16 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,6,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,9,0,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,2,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,9] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = 
[0,5,6,9,0,5,6,9] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 
64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,5,6,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,4,5,8,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 
512(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512DQ-FCP-NEXT: movb $24, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm4[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm7[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm28[4,5,4,5] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm8, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm23 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm13, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm12, %xmm19 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm10, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $-32, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: 
vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm26, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, 
%ymm8, %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm14 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm14, %zmm8, %zmm14 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; 
AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm4, (%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovaps 
%zmm8, (%rax) -; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-FCP-NEXT: vmovaps %zmm14, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, 
%zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512BW-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,5,12,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm4, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,9,0,7,0] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm30 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm29, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; 
AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,9] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,10] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm14 ; AVX512BW-NEXT: vmovdqu64 
%zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,13] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,14] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,8,15] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm21 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] ; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,0,4,11] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm22 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm15 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm29[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm2 -; AVX512BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm31 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm28 +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm9, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm27 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm24[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm25 ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm6 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} +; 
AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm4 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm30, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm17 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm18, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm18 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm18 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm12 -; AVX512BW-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; 
AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm13, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm2, %zmm19, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512BW-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-NEXT: vmovaps %zmm9, (%rax) -; AVX512BW-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512BW-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-NEXT: vmovaps %zmm2, (%rax) +; AVX512BW-NEXT: vmovaps %zmm13, 64(%rax) +; AVX512BW-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride7_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm31 +; AVX512BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,5,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm22, %zmm4, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,9,0,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm30 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2q %zmm29, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,9] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,10] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, 
%zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,8,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, 
%zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,0,4,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm28 +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm31 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm8 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm29[4,5,4,5] +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm28 +; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm27 +; 
AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm24[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm25 ; AVX512BW-FCP-NEXT: movb $-32, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm20, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm30, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm17 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm18 +; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm2 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = 
ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 192(%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 128(%r9) -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm13, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm2, %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-FCP-NEXT: vmovaps %zmm9, (%rax) -; AVX512BW-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm13, 64(%rax) +; AVX512BW-FCP-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride7_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = 
[12,5,12,5,12,5,12,5] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, 
%zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm31 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm7 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm4, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm30 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,9] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 912(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa 1360(%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,0,4,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vinserti128 
$1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm31 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm8 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm29[4,5,4,5] +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa 912(%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm28 +; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm27 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = 
[6,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm24[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm25 ; AVX512DQ-BW-NEXT: movb $-32, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 
{%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa 960(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm4 = 
mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm20, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm12, %zmm30, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %ymm17 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm18, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %ymm18 +; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm11 = 
mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 192(%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, (%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 128(%r9) -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 +; 
AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm13, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm2, %zmm19, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQ-BW-NEXT: vmovaps %zmm9, (%rax) -; AVX512DQ-BW-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-BW-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rax) +; 
AVX512DQ-BW-NEXT: vmovaps %zmm13, 64(%rax) +; AVX512DQ-BW-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 
%zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm4, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,5,6,12] +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,14,0] ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,9,0,7,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,0,4,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[4,5,4,5],zmm29[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa 1472(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm24[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm25 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 
{%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm20, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm30, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), 
%ymm17 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm18, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm8 = 
ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm13, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm2, %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm9, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm13, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <224 x i64>, ptr %in.vec, align 64 @@ -17247,4911 +16555,4623 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512-NEXT: subq $6216, %rsp # imm = 0x1848 +; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm19 ; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,11] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,7,14,0] +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,4,5,6,13] +; AVX512-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; 
AVX512-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm14 -; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm24 +; AVX512-NEXT: vpermt2q %zmm24, %zmm27, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte 
Reload -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 +; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm18, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm27, %zmm7 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512-NEXT: vpermt2q %zmm30, %zmm6, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm9 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm29 +; AVX512-NEXT: vpermt2q %zmm29, %zmm27, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 +; AVX512-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 -; 
AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm10 +; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm23 +; AVX512-NEXT: vpermt2q %zmm23, %zmm27, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm6, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm15 +; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm26 +; AVX512-NEXT: vpermt2q %zmm26, %zmm27, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm15 +; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm9, %zmm4, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm15[4,5,6,7] +; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm4 +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm16 +; AVX512-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa 2880(%rdi), %ymm5 +; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,9,0,7,0] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm19, %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512-NEXT: vpalignr 
$8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm3 +; AVX512-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm22, %zmm5, %zmm3 +; AVX512-NEXT: 
vpermt2q %zmm23, %zmm8, %zmm3 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2q %zmm12, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; 
AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm9, %zmm3, %zmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512-NEXT: vpermt2q %zmm16, %zmm8, %zmm5 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,3,10,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,0,4,11,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,5,12,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,6,13,0] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,2,9,0,0] +; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm15, %zmm21 +; AVX512-NEXT: vmovdqu64 %zmm21, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm13 +; 
AVX512-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512-NEXT: vpermt2q %zmm18, %zmm28, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm18, %zmm15, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512-NEXT: vpermt2q %zmm17, %zmm25, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512-NEXT: vpermt2q %zmm17, %zmm2, %zmm27 +; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm30 ; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512-NEXT: vpermt2q %zmm11, %zmm25, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512-NEXT: vpermt2q %zmm11, %zmm28, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm21 +; AVX512-NEXT: vpermt2q %zmm11, %zmm15, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm28, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm22, %zmm25, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-NEXT: vpermt2q %zmm22, %zmm28, %zmm4 +; AVX512-NEXT: vpermi2q %zmm6, %zmm30, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512-NEXT: vpermt2q %zmm22, %zmm5, %zmm8 +; AVX512-NEXT: vpermi2q %zmm6, %zmm30, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm10 +; AVX512-NEXT: vpermi2q %zmm6, %zmm30, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm30 +; AVX512-NEXT: vpermt2q %zmm22, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: 
vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm31 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm28 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm26 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 
64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm27 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm29 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $24, %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm16 = [0,7,14,0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 +; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm27 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,9,0,7,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm23 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm31 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm8, %zmm16, %zmm21 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm19 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] -; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm18, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm13 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm15, %zmm1, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm3[4,5,4,5],zmm4[4,5,4,5] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm26 +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm6[4,5,4,5] +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: movb $-32, %al -; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-NEXT: vpermt2q %zmm7, %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm11, %zmm3, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm11, %zmm15, %zmm3 +; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 +; AVX512-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 
64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: movb $-32, %al +; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm7 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa 960(%rdi), %ymm8 -; AVX512-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 -; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 -; AVX512-NEXT: vmovdqa 2304(%rdi), %ymm10 -; AVX512-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqa 1856(%rdi), %ymm12 -; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 -; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm13 -; AVX512-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm31, 448(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, 384(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2} ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm12, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, (%r9) +; AVX512-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; 
AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm20 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX512-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 2304(%rdi), %ymm2 +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 {%k2} +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX512-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa 1856(%rdi), %ymm11 +; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm24, %zmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm11 {%k2} +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm15, %zmm24, %zmm24 +; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm15 +; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm26, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm16, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 128(%rsi) 
+; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512-NEXT: vmovaps %zmm3, 448(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512-NEXT: vmovaps %zmm3, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm30, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, 256(%r8) +; AVX512-NEXT: vmovdqa64 %zmm2, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm20, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm5, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm25, 384(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, (%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, (%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512-NEXT: vmovaps %zmm24, 384(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, (%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512-FCP-NEXT: subq $6216, %rsp # imm = 0x1848 +; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), 
%zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: 
vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,7,14,0] +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,4,5,6,13] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm27, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm27, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm15 +; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm15[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm4 +; AVX512-FCP-NEXT: vpermi2q 
%zmm6, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 2880(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,9,0,7,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512-FCP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 
1984(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,3,10,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,0,4,11,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,5,12,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,6,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,2,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm28, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm30, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm30, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm30, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; 
AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: movb $24, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [0,7,14,0] +; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,9,0,7,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm31 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = 
zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm1, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm3[4,5,4,5],zmm4[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm6[4,5,4,5] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q 
%zmm15, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: movb $24, %al -; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512-FCP-NEXT: 
vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 
%zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: 
vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: movb $-32, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 
32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 -; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm13 = 
mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, 384(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm24, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2} ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; 
AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, (%r9) +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 2304(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 {%k2} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 1856(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm24, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 {%k2} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm15, %zmm24, %zmm24 +; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm26, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 448(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 384(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 320(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512-FCP-NEXT: vmovaps %zmm3, 448(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512-FCP-NEXT: vmovaps %zmm3, 256(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 384(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 448(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 256(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%r8) 
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 384(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512-FCP-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm24, 384(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512-FCP-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512DQ-NEXT: subq $6216, %rsp # imm = 0x1848 +; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, 
%xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512DQ-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,11] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,7,14,0] +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,4,5,6,13] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q 
%zmm30, %zmm9, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512DQ-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512DQ-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: 
vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 +; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm27, %zmm7 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm6, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm27, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 +; AVX512DQ-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm27, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm6, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm27, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm15 +; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm4, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm4 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa 2880(%rdi), %ymm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; 
AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,9,0,7,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm5, %zmm15 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm8, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = 
[10,3,10,3,10,3,10,3] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm8, %zmm5 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,3,10,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,0,4,11,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,5,12,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,6,13,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,2,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm15, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm28, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 
%zmm30, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm25, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm28, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm2, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm15, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm25, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm15, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm28, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm25, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm28, %zmm4 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm30, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQ-NEXT: 
vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm30, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; 
AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, 
%zmm1, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm28 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q 
%zmm8, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $24, %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm16 = [0,7,14,0] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,9,0,7,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm31 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 
{{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm16, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm19 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm18, 
%zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm16, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm1, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm3[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm6[4,5,4,5] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: movb $-32, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm3, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded 
Reload +; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: movb $-32, %al +; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm4 -; 
AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm8 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 -; AVX512DQ-NEXT: vmovdqa 2304(%rdi), %ymm10 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 1856(%rdi), %ymm12 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 -; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm13 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 448(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm13, 384(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte 
Reload -; AVX512DQ-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm13, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm24, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2} ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, (%r9) +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = 
mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 2304(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 1856(%rdi), %ymm11 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = 
mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm24, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm11 {%k2} +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm15, %zmm24, %zmm24 +; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm15 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm26, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 448(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 384(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 320(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512DQ-NEXT: vmovaps %zmm3, 448(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512DQ-NEXT: vmovaps %zmm3, 256(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 384(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 448(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 384(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512DQ-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-NEXT: vmovaps %zmm24, 384(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: subq $6216, %rsp # imm = 0x1848 +; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: 
vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,7,14,0] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 ; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, 
%zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 
3264(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm27, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm22 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm27, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, 
%zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 2880(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,9,0,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 
64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,3,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,0,4,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,5,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,6,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,2,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, 
%zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm28, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm30, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm30, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte 
Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $24, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm16 = [0,7,14,0] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm26 = [0,0,9,0,7,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm31 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm19 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, 
%zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm3[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm6[4,5,4,5] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: movb $-32, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: movb $-32, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 384(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm24, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2} ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm24, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%r9) +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, 
%ymm15 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 2304(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm15 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 1856(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm24, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 {%k2} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm15, %zmm24, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm26, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 448(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 384(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 320(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; 
AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 448(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 256(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 384(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 448(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 320(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 384(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps 
%zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm24, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-FCP-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $7624, %rsp # imm = 0x1DC8 +; AVX512BW-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [4,11] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,0,7,14,0] +; AVX512BW-NEXT: vmovdqu64 %zmm20, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,13] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm1 -; 
AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqa64 
2176(%rdi), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm14 +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm0 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm6 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,9,0,7,0] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm2 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm28, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,3,10,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = 
[0,0,0,0,0,4,11,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,5,12,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,6,13,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm22, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm8, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm22, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm5, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 +; 
AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; 
AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,7,14,0] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; 
AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm29 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm3[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 
+; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm5 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa 2304(%rdi), %ymm6 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa 1856(%rdi), %ymm7 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 
64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm18 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm15, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm19 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm15, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm20 +; 
AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm15, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm21 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm15, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm22 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm23, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm12, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%rcx) +; 
AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) @@ -22189,948 +21209,907 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512BW-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 +; AVX512BW-FCP-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; 
AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [4,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,0,7,14,0] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15 -; 
AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,9,0,7,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; 
AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm28, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,3,10,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,4,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,5,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,6,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 
%zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm22, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm5, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, 
%zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,7,14,0] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; 
AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q 
%zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm3[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 
= ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $-32, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = 
mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm18 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %ymm19 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm15, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %ymm20 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm15, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 
64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm15, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %ymm22 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm22, %xmm22 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 384(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 256(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm12, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm3, 448(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 320(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm26, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 448(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%r9) @@ -23168,948 +22147,907 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512BW-FCP-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride7_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-BW-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd 
{{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 2816(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [4,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 2368(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 1920(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm14 +; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 2880(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 
-; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 2432(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqa 2880(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm6 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 3328(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-NEXT: # zmm23 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm28, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q 
%zmm12, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q 
%zmm14, %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[0,5,6,9,0,5,6,9] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm8, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm22, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, 
%zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm5, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q 
%zmm30, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, 
%zmm17 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,7,14,0] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = 
zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} 
= zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} 
zmm12 {%k1} = zmm3[4,5,4,5],zmm6[4,5,4,5] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, 
%zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm13, %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-BW-NEXT: movb $-32, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 
%zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 2752(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 960(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 2304(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 1856(%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 3200(%rdi), %ymm8 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm13 +; 
AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %ymm18 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm18, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %ymm19 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm19, %zmm15, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %ymm20 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm20, %zmm15, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm21, %zmm15, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %ymm22 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm22, %zmm23, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm22 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 448(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 384(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 320(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 256(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rsi) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm12, (%rsi) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 384(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 448(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 256(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 384(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 448(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 256(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm3, 448(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 256(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 320(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 384(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 448(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 320(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 384(%r8) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%r9) @@ -24147,948 +23085,907 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-BW-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-BW-FCP-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,14,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 
; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm21 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm28, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,3,10,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,4,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,5,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,6,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,2,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm22, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm5, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,10] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,4,5,8,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 
64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,6,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,9,0,7,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 
%zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm3[4,5,4,5],zmm6[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm13, %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $-32, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte 
Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm15, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm15, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vinserti32x4 $0, %xmm21, %zmm15, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm22, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 384(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 256(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 448(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 320(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 448(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm20, 320(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%r9) @@ -25126,18 +24023,21 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-BW-FCP-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <448 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index 51b6222077f82..f89d4c6aaa9d4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -1796,105 +1796,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: 
vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512-NEXT: movb $-64, %bl ; AVX512-NEXT: kmovw %ebx, %k1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: 
vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; 
AVX512-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) @@ -1908,105 +1902,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512-FCP-NEXT: movb $-64, %bl ; AVX512-FCP-NEXT: kmovw %ebx, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa 
64(%rdi), %xmm10 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = 
ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] 
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) @@ -2020,105 +2008,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512DQ-NEXT: movb $-64, %bl ; AVX512DQ-NEXT: kmovw %ebx, %k1 -; AVX512DQ-NEXT: vmovdqa64 
%zmm9, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512DQ-NEXT: 
vpunpckhqdq {{.*#+}} ymm8 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-NEXT: vpermi2q 
%zmm2, %zmm3, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) @@ -2132,105 +2114,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512DQ-FCP-NEXT: movb $-64, %bl ; AVX512DQ-FCP-NEXT: kmovw %ebx, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; 
AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: 
vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) @@ -2244,105 +2220,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 
+; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512BW-NEXT: movb $-64, %bl ; AVX512BW-NEXT: kmovd %ebx, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = 
ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) @@ -2356,105 +2326,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: 
vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512BW-FCP-NEXT: movb $-64, %bl ; AVX512BW-FCP-NEXT: kmovd %ebx, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; 
AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) @@ -2468,105 +2432,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: 
vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512DQ-BW-NEXT: movb $-64, %bl ; AVX512DQ-BW-NEXT: kmovd %ebx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; 
AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm8, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) @@ -2580,105 +2538,99 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %bl ; AVX512DQ-BW-FCP-NEXT: kmovd %ebx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = 
zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm6[0],zmm4[0],zmm6[2],zmm4[2],zmm6[4],zmm4[4],zmm6[6],zmm4[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm6[1],zmm4[1],zmm6[3],zmm4[3],zmm6[5],zmm4[5],zmm6[7],zmm4[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm14 +; 
AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r11) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) @@ -4040,1785 +3992,1633 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride8_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512-NEXT: vmovaps (%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, 
%k1 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; 
AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512-NEXT: 
vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512-NEXT: vpmovsxbq 
{{.*#+}} ymm13 = [0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, 
%zmm26, %zmm2 -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,13] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; 
AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm7, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: popq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride8_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512-FCP-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512-FCP-NEXT: pushq %rax +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = 
[0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512-FCP-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = 
ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q 
%zmm11, %zmm27, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] 
+; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = 
zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} xmm2 = [5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm7, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm27, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: popq %rax ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride8_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512DQ-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-NEXT: vpermt2q 
%zmm16, %zmm19, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512DQ-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; 
AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; 
AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] 
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512DQ-FCP-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: pushq %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 
448(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] 
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm6, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 
%zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; 
AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; 
AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: popq %rax ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512BW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512BW-NEXT: pushq %rax +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm29 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = 
ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512BW-NEXT: 
vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 
-; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512BW-NEXT: vpunpcklqdq 
{{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride8_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512BW-FCP-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: 
vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: pushq %rax +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, 
%ymm4 +; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512BW-FCP-NEXT: 
vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; 
AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: popq %rax ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride8_vf16: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512DQ-BW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 
64(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: pushq %rax +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 
{%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm20 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; 
AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = 
[0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; 
AVX512DQ-BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-BW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: popq %rax ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512DQ-BW-FCP-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: pushq %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm18 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 192(%rdi), %ymm19, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm20 +; 
AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm20[0],ymm2[0],ymm20[2],ymm2[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 
%zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 
$0, %ymm0, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm2[1],ymm20[3],ymm2[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = 
zmm21[0],zmm11[0],zmm21[2],zmm11[2],zmm21[4],zmm11[4],zmm21[6],zmm11[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm21[1],zmm11[1],zmm21[3],zmm11[3],zmm21[5],zmm11[5],zmm21[7],zmm11[7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm15[0],zmm9[0],zmm15[2],zmm9[2],zmm15[4],zmm9[4],zmm15[6],zmm9[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm15[1],zmm9[1],zmm15[3],zmm9[3],zmm15[5],zmm9[5],zmm15[7],zmm9[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, 
%ymm0, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm29, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm26[0],zmm7[2],zmm26[2],zmm7[4],zmm26[4],zmm7[6],zmm26[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm7[1],zmm26[1],zmm7[3],zmm26[3],zmm7[5],zmm26[5],zmm7[7],zmm26[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: popq %rax ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -8735,4145 +8535,3905 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride8_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; 
AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512-NEXT: movb $-64, %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 
1664(%rdi), %ymm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: movb $-64, %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rdi), 
%zmm0 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; 
AVX512-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} 
+; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 
%zmm26, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; AVX512-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; AVX512-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512-NEXT: 
vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, 
%ymm5, %zmm9, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512-NEXT: vmovdqa64 512(%rdi), %xmm17 +; AVX512-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 ; AVX512-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 
32-byte Folded Reload -; AVX512-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm28, %zmm20 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rsi) 
-; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, (%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, (%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, (%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, (%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, (%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, (%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 128(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, (%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride8_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512-FCP-NEXT: movb $-64, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = 
ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512-FCP-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; 
AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512-FCP-NEXT: vmovdqa64 
1088(%rdi), %zmm7 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512-FCP-NEXT: movb $-64, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = 
zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] 
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, 
%zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512-FCP-NEXT: 
vmovdqa64 640(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm17, 
%zmm3, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm13, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm22 
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 -; AVX512-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded 
Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %xmm17 +; AVX512-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm28, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, (%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, (%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, (%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 128(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512-FCP-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride8_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-NEXT: movb $-64, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = 
ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQ-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 
%zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 = 
zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-NEXT: movb $-64, %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-NEXT: 
vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; 
AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 
%zmm2 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512DQ-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = 
zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; 
AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512DQ-NEXT: 
vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512DQ-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %xmm17 +; AVX512DQ-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512DQ-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; 
AVX512DQ-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512DQ-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm28, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq 
{{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, (%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512DQ-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 128(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rax) ; 
AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: movb $-64, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, 
%zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: movb $-64, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, 
%zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = 
ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; 
AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %xmm17 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm28, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 
(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 128(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm13 -; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512BW-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; 
AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: 
vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = 
ymm1[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 
+; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512BW-NEXT: 
vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; 
AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q 
%zmm4, %zmm17, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512BW-NEXT: 
vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm28, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = 
ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-NEXT: 
vmovdqa64 %zmm1, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512BW-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512BW-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride8_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: movb $-64, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, 
%zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512BW-FCP-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), 
%zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: 
vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: movb $-64, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = 
zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; 
AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm9, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; 
AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; 
AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 
{%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa64 
512(%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 
64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm28, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, (%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 
64(%r8) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512BW-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-FCP-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride8_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: movb $-64, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] 
-; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQ-BW-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte 
Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = 
zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: movb $-64, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, 
%zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: 
vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, 
%zmm21, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %xmm17 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = 
ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512DQ-BW-NEXT: vinserti64x4 
$0, %ymm20, %zmm28, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
-; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte 
Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-BW-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-BW-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-BW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: movb $-64, %al -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq 
{{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = 
ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQ-BW-FCP-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,3,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] 
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm21[0],zmm0[2],zmm21[2],zmm0[4],zmm21[4],zmm0[6],zmm21[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm4[0],zmm16[0],zmm4[2],zmm16[2],zmm4[4],zmm16[4],zmm4[6],zmm16[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm2[0],zmm14[0],zmm2[2],zmm14[2],zmm2[4],zmm14[4],zmm2[6],zmm14[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm21[1],zmm0[3],zmm21[3],zmm0[5],zmm21[5],zmm0[7],zmm21[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm4[1],zmm16[1],zmm4[3],zmm16[3],zmm4[5],zmm16[5],zmm4[7],zmm16[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq 
{{.*#+}} zmm9 = zmm2[1],zmm14[1],zmm2[3],zmm14[3],zmm2[5],zmm14[5],zmm2[7],zmm14[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 = zmm3[0],zmm1[0],zmm3[2],zmm1[2],zmm3[4],zmm1[4],zmm3[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm1[1],zmm3[3],zmm1[3],zmm3[5],zmm1[5],zmm3[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: movb $-64, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm21[0],zmm2[0],zmm21[2],zmm2[2],zmm21[4],zmm2[4],zmm21[6],zmm2[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq 
{{.*#+}} zmm4 {%k1} = zmm21[1],zmm2[1],zmm21[3],zmm2[3],zmm21[5],zmm2[5],zmm21[7],zmm2[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm18[0],zmm0[0],zmm18[2],zmm0[2],zmm18[4],zmm0[4],zmm18[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, 
%zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm17[0],ymm10[2],ymm17[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-BW-FCP-NEXT: 
vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm5[0],ymm14[2],ymm5[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm17[1],ymm10[3],ymm17[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm13 +; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm16 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm17, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %xmm20 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm20, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm21[0],ymm20[0],ymm21[2],ymm20[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 
%zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm28, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm21[1],ymm20[1],ymm21[3],ymm20[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm28, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm21, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k1} +; 
AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm17, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps 
%zmm0, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 64(%r9) 
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -18924,972 +18484,972 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; 
AVX512-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm27 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512-NEXT: vperm2i128 
{{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-NEXT: vmovdqa64 1728(%rdi), %ymm19 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %ymm18 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3968(%rdi), %zmm0 -; AVX512-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, 
%ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm14, 
%zmm5, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512-NEXT: 
vmovdqa64 %zmm28, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; 
AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; 
AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3584(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = 
zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; 
AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; 
AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; 
AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm5 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-NEXT: vpermt2q %zmm27, %zmm17, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512-NEXT: vpermt2q %zmm3, 
%zmm17, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 
{%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte 
Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = 
zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, 
%zmm13 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, %ymm27 +; AVX512-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 -; AVX512-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512-NEXT: vinserti32x4 $1, 2752(%rdi), %ymm25, %ymm25 +; AVX512-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = 
ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512-NEXT: vinserti64x4 $0, 
%ymm7, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19973,993 +19533,990 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rax) +; 
AVX512-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride8_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512-FCP-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512-FCP-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vmovdqa64 
1728(%rdi), %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %ymm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512-FCP-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm2[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512-FCP-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512-FCP-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; 
AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: 
# zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512-FCP-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512-FCP-NEXT: 
vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; 
AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = 
zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 
# 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q 
%zmm6, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 
%zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 +; 
AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; 
AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm13 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, %ymm27 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = 
ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512-FCP-NEXT: vinserti32x4 $1, 2752(%rdi), %ymm25, %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512-FCP-NEXT: 
vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512-FCP-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512-FCP-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = 
ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21043,993 +20600,990 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 
128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512-FCP-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQ-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 
%zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm27 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %ymm19 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %ymm18 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 
1408(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} 
ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512DQ-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3968(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512DQ-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512DQ-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512DQ-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm26, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 
64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 
3584(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} 
zmm2 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, 
%zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; 
AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 
# 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512DQ-NEXT: 
vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm17, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 
%zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm13 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512DQ-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, %ymm27 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512DQ-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQ-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512DQ-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512DQ-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512DQ-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512DQ-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-NEXT: vinserti128 
$1, 2752(%rdi), %ymm10, %ymm10 -; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512DQ-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512DQ-NEXT: vinserti32x4 $1, 2752(%rdi), %ymm25, %ymm25 +; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512DQ-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512DQ-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512DQ-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512DQ-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512DQ-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512DQ-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512DQ-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -22113,993 +21667,990 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512DQ-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512DQ-NEXT: vzeroupper ; 
AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-FCP-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; 
AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-FCP-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 3968(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512DQ-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-FCP-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 
-; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; 
AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: 
vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte 
Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = 
zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, 
%zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 
64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 
$0, %ymm1, %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm27, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded 
Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 
1024(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2752(%rdi), %ymm25, 
%ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; 
AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23183,993 +22734,990 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512DQ-FCP-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512BW-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 
3520(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512BW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512BW-NEXT: 
vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %ymm19 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; 
AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, 
%zmm29 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, 
%zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = 
zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 
%zmm22, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; 
AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = 
zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; 
AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, 
%zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; 
AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm13 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, %ymm27 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 
%zmm24, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512BW-NEXT: vinserti32x4 $1, 2752(%rdi), %ymm25, %ymm25 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; 
AVX512BW-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512BW-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rsi) -; 
AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 
384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -24253,993 +23801,990 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride8_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; 
AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-FCP-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: 
vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %ymm19 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %ymm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-FCP-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512BW-FCP-NEXT: 
vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512BW-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: 
vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512BW-FCP-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512BW-FCP-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm10[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 +; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: 
vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 
64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded 
Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} 
zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = 
zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 
64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, 
%zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, %ymm27 +; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512BW-FCP-NEXT: 
vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512BW-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2752(%rdi), %ymm25, %ymm25 +; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 
%zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = 
ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) 
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -25323,993 +24868,990 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512BW-FCP-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride8_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm4 ; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-BW-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512DQ-BW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-BW-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm27 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %ymm18 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq 
{{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-BW-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3968(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512DQ-BW-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512DQ-BW-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-BW-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = 
ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q 
%zmm27, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 
3712(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3584(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 
64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq 
{{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = 
zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} 
xmm1 = [6,14] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, 
%zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q 
%zmm9, %zmm12, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q 
%zmm0, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, 
%ymm27 +; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512DQ-BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 
2752(%rdi), %ymm25, %ymm25 +; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512DQ-BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -26393,993 +25935,990 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: addq $6664, %rsp # 
imm = 0x1A08 +; AVX512DQ-BW-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: subq $6920, %rsp # imm = 0x1B08 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3136(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3072(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3136(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %ymm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
1664(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm17[0],ymm24[2],ymm17[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm31[0],ymm16[2],ymm31[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm24, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2688(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2624(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm5[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 4032(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3968(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3904(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3712(%rdi), %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3840(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3776(%rdi), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3712(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3648(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3584(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm17[1],ymm24[3],ymm17[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm31[1],ymm16[3],ymm31[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm3[0],mem[0],zmm3[2],mem[2],zmm3[4],mem[4],zmm3[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 3264(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: 
vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm23[0],zmm25[2],zmm23[2],zmm25[4],zmm23[4],zmm25[6],zmm23[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm24[0],zmm19[0],zmm24[2],zmm19[2],zmm24[4],zmm19[4],zmm24[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm7[0],zmm14[0],zmm7[2],zmm14[2],zmm7[4],zmm14[4],zmm7[6],zmm14[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3648(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3584(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm26[0],zmm13[2],zmm26[2],zmm13[4],zmm26[4],zmm13[6],zmm26[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,5,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = 
zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm24[1],zmm20[1],zmm24[3],zmm20[3],zmm24[5],zmm20[5],zmm24[7],zmm20[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm12 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: 
vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm5[1],mem[1],zmm5[3],mem[3],zmm5[5],mem[5],zmm5[7],mem[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm11[1],mem[1],zmm11[3],mem[3],zmm11[5],mem[5],zmm11[7],mem[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 
# 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm31, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm25[1],mem[1],zmm25[3],mem[3],zmm25[5],mem[5],zmm25[7],mem[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm4 {%k1} = zmm20[0],mem[0],zmm20[2],mem[2],zmm20[4],mem[4],zmm20[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm21[0],zmm18[0],zmm21[2],zmm18[2],zmm21[4],zmm18[4],zmm21[6],zmm18[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm4 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm30[0],zmm14[2],zmm30[2],zmm14[4],zmm30[4],zmm14[6],zmm30[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} 
= zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, 
%zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm31[0],zmm7[2],zmm31[2],zmm7[4],zmm31[4],zmm7[6],zmm31[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm6, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm9[1],zmm0[1],zmm9[3],zmm0[3],zmm9[5],zmm0[5],zmm9[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm0[1],zmm23[3],zmm0[3],zmm23[5],zmm0[5],zmm23[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = 
zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm21[1],zmm0[1],zmm21[3],zmm0[3],zmm21[5],zmm0[5],zmm21[7],zmm0[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 
# 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm7, %ymm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm7, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm7[0],ymm27[0],ymm7[2],ymm27[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm4, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm4[0],ymm20[0],ymm4[2],ymm20[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm23 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm23, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm30 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm30, %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm25 = 
ymm26[0],ymm6[0],ymm26[2],ymm6[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %xmm25 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2752(%rdi), %ymm25, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm28 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq 
{{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3136(%rdi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3264(%rdi), %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm29 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm29, %ymm29 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm31, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3584(%rdi), %xmm31 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm31, %ymm31 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm31[0],ymm1[0],ymm31[2],ymm1[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm27[1],ymm7[3],ymm27[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm20[1],ymm4[3],ymm20[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm26[1],ymm6[1],ymm26[3],ymm6[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm31[1],ymm1[1],ymm31[3],ymm1[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 320(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 384(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -27463,22 +27002,19 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512DQ-BW-FCP-NEXT: addq $6920, %rsp # imm = 0x1B08 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <512 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index d1d7cb0a34332..848f2f1c94b7c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -1410,13 +1410,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-NEXT: vmovdqa %ymm4, (%rdx) +; AVX512-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512-NEXT: vmovdqa %ymm2, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1438,13 +1436,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1466,13 +1462,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa %ymm4, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1494,13 +1488,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -2432,18 +2424,16 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8)) ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2)) ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] -; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512-NEXT: vmovdqa %ymm2, 32(%rsi) +; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] +; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] +; AVX512-NEXT: vmovdqa %ymm3, 32(%rsi) ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512-NEXT: vmovdqa %ymm4, 32(%rcx) ; AVX512-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512-NEXT: vzeroupper @@ -2479,18 +2469,16 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8)) ; AVX512-FCP-NEXT: vpalignr 
{{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2)) ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512-FCP-NEXT: vmovdqa %ymm2, 32(%rsi) +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] +; AVX512-FCP-NEXT: vmovdqa %ymm3, 32(%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm4, 32(%rcx) ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512-FCP-NEXT: vzeroupper @@ -2526,18 +2514,16 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8)) ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2)) ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = 
ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512DQ-NEXT: vmovdqa %ymm2, 32(%rsi) +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] +; AVX512DQ-NEXT: vmovdqa %ymm3, 32(%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm4, 32(%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -2573,18 +2559,16 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 32(%rsi) +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26] +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 32(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, 32(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index abef980277ece..abc0453bf4db0 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -1625,7 +1625,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,4,8,12] ; AVX512-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vpmovdb %zmm2, %xmm5 @@ -1665,7 +1665,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,4,8,12] ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vpmovdb %zmm2, %xmm5 @@ -1705,7 +1705,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,4,8,12] ; AVX512DQ-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm5 @@ -1745,7 +1745,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm2, %xmm5 @@ -3027,7 +3027,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,4,8,12] ; AVX512-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3097,7 +3097,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,4,8,12] ; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3167,7 +3167,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,4,8,12] ; AVX512DQ-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; 
AVX512DQ-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3237,7 +3237,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index ac14f55e3f0ed..2010fc7ad1c4b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -6395,203 +6395,204 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i8_stride5_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm23 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm24 +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm22 ; AVX512-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 
= xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm6 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX512-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm25 ^ (ymm11 & (ymm7 ^ ymm25)) +; AVX512-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm8 ^ (mem & (ymm11 ^ ymm8)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,5,0,0] +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermd %ymm13, %ymm12, %ymm14 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ~ymm17) | ymm11 +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm20 & (zmm9 ^ zmm10)) +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm16 +; AVX512-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm13 ^ ymm10)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm9[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm17) | ymm15 +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm18 & (ymm1 ^ ymm2)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm22 ^ ymm21)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm24 ^ (ymm0 & (ymm23 ^ ymm24)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13,u,u,u] +; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm19) | ymm3 +; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm20 & (zmm2 ^ zmm0)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm13 ^ ymm10)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm0 ; AVX512-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 +; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm10 ^ ymm13)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,8,13],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm13 ^ (ymm4 & (ymm10 ^ ymm13)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero +; 
AVX512-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm15 & (ymm7 ^ ymm25)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm8 ^ (mem & (ymm15 ^ ymm8)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm18 & (ymm3 ^ ymm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] +; AVX512-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: vzeroupper @@ -6600,203 +6601,204 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i8_stride5_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 -; 
AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm6 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm25 ^ (ymm11 & (ymm7 ^ ymm25)) +; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm8 ^ (mem & (ymm11 ^ ymm8)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; 
AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512-FCP-NEXT: 
vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,5,0,0] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ~ymm17) | ymm11 +; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm20 & (zmm9 ^ zmm10)) +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm16 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm13 ^ ymm10)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) +; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm9[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm17) | ymm15 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm18 & (ymm1 ^ ymm2)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm22 ^ ymm21)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm24 ^ (ymm0 & (ymm23 ^ ymm24)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm19) | ymm3 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm20 & (zmm2 ^ zmm0)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm13 ^ ymm10)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm10 ^ ymm13)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) 
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,8,13],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm13 ^ (ymm4 & (ymm10 ^ ymm13)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ 
(ymm10 & (ymm8 ^ ymm26)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm15 & (ymm7 ^ ymm25)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm8 ^ (mem & (ymm15 ^ ymm8)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm18 & (ymm3 ^ ymm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -6805,203 +6807,204 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i8_stride5_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm23 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm23 +; 
AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm22 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm6 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm25 ^ (ymm11 & (ymm7 ^ ymm25)) +; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm8 ^ (mem & (ymm11 ^ ymm8)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512DQ-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = 
[0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,5,0,0] +; 
AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermd %ymm13, %ymm12, %ymm14 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ~ymm17) | ymm11 +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm20 & (zmm9 ^ zmm10)) +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm16 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm13 ^ ymm10)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm9[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm14[1,6,11],zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm17) | ymm15 +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm18 & (ymm1 ^ ymm2)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm22 ^ ymm21)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm24 ^ (ymm0 & (ymm23 ^ ymm24)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13,u,u,u] +; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm19) | ymm3 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm20 & (zmm2 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm17 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm13 ^ ymm10)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ 
ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm10 ^ ymm13)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,8,13],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; 
AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm13 ^ (ymm4 & (ymm10 ^ ymm13)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm15 & (ymm7 ^ ymm25)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm8 ^ (mem & (ymm15 ^ ymm8)) +; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm18 & (ymm3 ^ ymm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] +; AVX512DQ-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: vzeroupper @@ -7010,203 +7013,204 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm25 ^ (ymm11 & (ymm7 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm8 ^ (mem & (ymm11 ^ ymm8)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, 
%ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,5,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = 
[0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ~ymm17) | ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm20 & (zmm9 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm13 ^ ymm10)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm9[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm9 +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm17) | ymm15 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm18 & (ymm1 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm22 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm24 ^ (ymm0 & (ymm23 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm19) | ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm20 & (zmm2 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm13 ^ ymm10)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm10 ^ ymm13)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] 
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,8,13],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm18 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm13 ^ (ymm4 & (ymm10 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm15 & (ymm7 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm8 ^ (mem & (ymm15 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm18 & (ymm3 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -7231,164 +7235,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm8 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; 
AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512BW-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,5,0,0] +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermd %ymm9, %ymm7, %ymm7 ; AVX512BW-NEXT: movl $127, %eax ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k4} +; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm8, %ymm9 +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512BW-NEXT: 
vpblendmw %ymm7, %ymm6, %ymm12 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm18 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm9 {%k6} +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm12 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero,xmm12[3,8,13,u,u,u] +; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm9 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] ; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm9 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[1,6,11],zero,zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, 
%zmm16, %zmm15 -; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm12 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm12, %ymm14 +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm15 {%k1} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm19 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] +; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} +; AVX512BW-NEXT: 
vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] ; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm4 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512BW-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm8 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm15 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm8 +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm14 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] +; AVX512BW-NEXT: vporq %xmm16, %xmm14, %xmm14 +; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm8 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm14 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm4 
{%k5} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] -; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm8 {%k5} +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] +; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm8 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm8[2,3,0,1] ; AVX512BW-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm9 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm8 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[3,8,13],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm12 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm8[3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 ; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k5} +; 
AVX512BW-NEXT: vextracti64x4 $1, %zmm8, %ymm10 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} ; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm10 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -7401,35 +7405,36 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[4,9,14],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512BW-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rsi) -; AVX512BW-NEXT: vmovdqa64 
%zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -7453,164 +7458,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm8 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,5,0,0] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm7, %ymm7 ; AVX512BW-FCP-NEXT: movl $127, %eax ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm8, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm12 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm18 ; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k3} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm9 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm12 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero,xmm12[3,8,13,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm9 {%k2} +; AVX512BW-FCP-NEXT: 
vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm9 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[1,6,11],zero,zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm14 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, 
%zmm19 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} +; AVX512BW-FCP-NEXT: 
vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm8 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm14, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm15 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm8 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm14 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm8 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm14 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm8 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm8 {%k2} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm8[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm8 {%k5} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[3,8,13],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm12 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm8[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm8 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm8, %ymm10 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -7623,35 +7628,36 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] ; 
AVX512BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[4,9,14],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -7675,164 +7681,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm8 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = 
ymm4[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,5,0,0] +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermd %ymm9, %ymm7, %ymm7 ; AVX512DQ-BW-NEXT: movl $127, %eax ; AVX512DQ-BW-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm8, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm12 {%k2} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm18 ; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k3} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm12, %ymm9 {%k6} +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm12 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero,xmm12[3,8,13,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm9 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm9 {%k6} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[1,6,11],zero,zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-NEXT: 
vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm12 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm12, %ymm14 +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm15 {%k1} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm19 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-NEXT: 
vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm4 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm8 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm14, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm15 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm8 +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm14 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm14, %xmm14 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-NEXT: 
vmovdqu8 %ymm14, %ymm8 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm14 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm8 {%k5} +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm8 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm8[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm9 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm8 {%k5} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[3,8,13],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm12 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm8[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm8, %ymm10 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm10 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -7845,35 +7851,36 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[4,9,14],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -7897,164 +7904,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm8 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm7, %ymm7 ; AVX512DQ-BW-FCP-NEXT: movl $127, %eax ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm8, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm18 ; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = 
zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm9 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero,xmm12[3,8,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm9 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[1,6,11],zero,zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; 
AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0,1,2],xmm15[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm14, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm15 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm8 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm8[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm9[3,8,13],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm8[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; 
AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm8, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -8067,35 +8074,36 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[4,9,14],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index ca8fcf2ee0f2c..5a762219ca0c1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -526,8 +526,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8,10,1,3,8,10,1,3] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,10,0,0,0,0,1,3] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 @@ -594,8 +593,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8,10,1,3,8,10,1,3] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,10,0,0,0,0,1,3] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 @@ -1032,8 +1030,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,1,5,2,6,0,0] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -1045,22 +1042,19 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,0,0,4,1,5] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,4,0,0,4,0,0] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,3,7,7,3,0,0] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 @@ -1124,8 +1118,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,1,5,2,6,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -1137,22 +1130,19 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,0,0,4,1,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,4,0,0,4,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,3,7,7,3,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index fdbe919c0f73e..fb0df02705479 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -670,12 +670,10 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,2,0,0,4,2,0] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,1,5,3,0] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -752,12 +750,10 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,2,0,0,4,2,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,1,5,3,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3034,8 +3030,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 @@ -3296,8 +3291,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 @@ -6242,7 +6236,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm22 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 @@ -6331,7 +6325,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] @@ -6352,7 +6346,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] @@ -6361,9 +6355,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; 
AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm1 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] @@ -6413,8 +6406,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] @@ -6432,7 +6424,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] @@ -6444,7 +6436,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm20, %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload @@ -6867,7 +6859,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 @@ -6956,7 +6948,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] @@ -6977,7 +6969,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] @@ -6986,9 +6978,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm1 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] @@ -7038,8 +7029,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] @@ -7057,7 +7047,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] @@ -7069,7 +7059,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload @@ -13229,7 +13219,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-FCP-LABEL: store_i16_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX512-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 @@ -13256,48 +13246,48 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm15 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vpor %ymm15, %ymm13, %ymm13 +; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 @@ -13315,32 +13305,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm12 -; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm14) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13) ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm6 ; 
AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 ; AVX512-FCP-NEXT: vprold $16, %ymm6, %ymm11 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] @@ -13351,38 +13341,35 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] ; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm4 ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpandn %ymm11, %ymm14, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [151522058,0,421010202,421010202] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm24 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [218894094,0,488382238,488382238] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm16 = [0,2,2,3,10,9,11,11] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm11 & (zmm1 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm23 & (zmm1 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 @@ -13402,35 +13389,34 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; 
AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm2 ; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm7 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm11 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm23 & (zmm0 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] @@ -13438,70 +13424,68 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vprold $16, %ymm10, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,2,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm16 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm18 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm28[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm17 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm31 & (zmm3 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm15 ; AVX512-FCP-NEXT: vprold $16, %ymm25, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm25 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,3,3,10,9,11,10] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm13 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm29[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,9,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm7 @@ -13509,9 +13493,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, 
%xmm5, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] @@ -13522,61 +13506,60 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm1 ; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6 ; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm1)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm30 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm20 & (zmm30 ^ zmm1)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm1 & (zmm30 ^ zmm5)) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm6 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3] +; AVX512-FCP-NEXT: vmovdqa 
(%rsi), %xmm13 +; AVX512-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vprold $16, %xmm13, %xmm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm3 & (zmm8 ^ zmm5)) ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm5 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm3 ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm30 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm11 & (zmm30 ^ zmm3)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm1 & (zmm30 ^ zmm8)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm29 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm20 & (zmm29 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm8)) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -13587,196 +13570,191 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm11 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm13 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm13 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,2,2,3,8,9,9,0] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm12 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm5 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7,8,9,10],ymm5[11],ymm8[12,13],ymm5[14],ymm8[15] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm3 ^ (mem & (zmm1 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm3 ^ (zmm31 & (zmm1 ^ zmm3)) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512-FCP-NEXT: vpermt2q 
%zmm3, %zmm22, %zmm11 +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm17 & (zmm8 ^ zmm12)) +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm23 & (zmm8 ^ zmm12)) ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm23, %zmm16 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm3)) +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm11)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1)) ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm25 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm26 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm14 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm6 -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm18 -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm15 +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm17 & (zmm14 ^ zmm0)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [6,7,3,3,7,7,6,7] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm23 & (zmm14 ^ zmm0)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [6,7,3,3,7,7,6,7] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm24 & (zmm19 ^ zmm12)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm8)) -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512-FCP-NEXT: vprold $16, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm12)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm27 & (zmm19 ^ zmm8)) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm31 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,1,8,8,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm22 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm0 ^ (zmm26 & (zmm11 ^ zmm0)) -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,8,9,0] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm11 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm21 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6],xmm13[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,0,9] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
[65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0)) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm24 & (ymm20 ^ ymm0)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm22 = ymm22 ^ (ymm23 & (ymm22 ^ ymm0)) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [8,9,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [8,9,9,0,0,0,1,1] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,0,1,1] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,0,1,1] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm31 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,0,1,1] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm7 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ zmm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm1)) ; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1 ; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm24 & (zmm15 ^ zmm3)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm14)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm20 & (zmm15 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm27 & (zmm7 ^ zmm14)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm11)) -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm13)) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3],xmm3[4],xmm11[5,6],xmm3[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm9 ^ (zmm26 & (zmm3 ^ zmm9)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm2 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm28 & (zmm2 ^ zmm9)) +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 +; 
AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm5 +; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm24 & (zmm2 ^ zmm4)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm3)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm20 & (zmm4 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm1 & (ymm5 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm4 & (ymm6 ^ ymm20)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm5[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = (zmm0 & zmm4) | mem -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = (zmm4 & zmm5) | mem +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm5 & (ymm6 ^ ymm22)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm3[0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (zmm0 & zmm5) | mem +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = 
(zmm3 & zmm5) | mem ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] @@ -13789,9 +13767,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -13822,31 +13799,31 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm26 & (zmm6 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5)) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm5 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm26 & (zmm8 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm28 & (zmm8 ^ zmm5)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm5 & (zmm6 ^ zmm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm5 & (zmm8 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm5 & (zmm8 ^ zmm3)) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (mem & (zmm4 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (mem & (zmm3 ^ zmm0)) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem) ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm12 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm12 = zmm12 | (zmm1 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3)) ; AVX512-FCP-NEXT: 
movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 448(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -13854,9 +13831,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512-FCP-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX512-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -14563,7 +14540,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX512DQ-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 @@ -14590,48 +14567,48 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpor %ymm15, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 @@ -14649,32 +14626,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm14) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13) ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm6, %ymm11 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] @@ -14685,38 +14662,35 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm4 ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm14, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] ; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [151522058,0,421010202,421010202] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm24 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [218894094,0,488382238,488382238] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [218894094,0,488382238,488382238] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm11 & (zmm1 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm23 & (zmm1 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 @@ -14736,35 +14710,34 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; 
AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm11 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm23 & (zmm0 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] @@ -14772,70 +14745,68 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm18 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm17 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm31 & (zmm3 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm15 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm25, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm25 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm13 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm29[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] ; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,9,9,0,0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm7 @@ -14843,9 +14814,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] @@ -14856,61 +14827,60 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6 ; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm30 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm20 & (zmm30 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} zmm30 = zmm30 ^ (zmm1 & (zmm30 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vprold $16, %xmm13, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm3 & (zmm8 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm30 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm11 & (zmm30 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = 
zmm30 ^ (zmm1 & (zmm30 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm29 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm20 & (zmm29 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -14921,196 +14891,191 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,2,2,3,8,9,9,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7,8,9,10],ymm5[11],ymm8[12,13],ymm5[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm3 ^ (mem & (zmm1 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm3 ^ (zmm31 & (zmm1 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm17 & (zmm8 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm23 & (zmm8 ^ zmm12)) ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm23, %zmm16 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm11)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm25 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm26 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm14 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm17 & (zmm14 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm23 & (zmm14 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = 
[6,7,3,3,7,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm24 & (zmm19 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm27 & (zmm19 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm31 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,1,8,8,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhwd 
{{.*#+}} xmm3 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm22 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm0 ^ (zmm26 & (zmm11 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,8,9,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm11 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm21 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6],xmm13[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm24 & (ymm20 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm22 = ymm22 ^ (ymm23 & (ymm22 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [8,9,9,0,0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = 
[8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,0,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm31 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm7 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1 ; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm24 & (zmm15 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm20 & (zmm15 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm27 & (zmm7 ^ zmm14)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %xmm31, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3],xmm3[4],xmm11[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm9 ^ (zmm26 & (zmm3 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm28 & (zmm2 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm5 +; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm24 & (zmm2 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm20 & (zmm4 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm1 & (ymm5 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm4 & (ymm6 ^ ymm20)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm5[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & zmm4) | mem -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm4 = (zmm4 & zmm5) | mem +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm5 & (ymm6 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & zmm5) | mem +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (zmm3 & zmm5) | mem ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] @@ -15123,9 +15088,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -15156,31 +15120,31 @@ define void 
@store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm26 & (zmm6 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm5 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm26 & (zmm8 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm28 & (zmm8 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm5 & (zmm6 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm5 & (zmm8 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm5 & (zmm8 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (mem & (zmm4 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (mem & (zmm3 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm12 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm12 = zmm12 | (zmm1 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -15188,15 +15152,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512DQ-FCP-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX512DQ-FCP-NEXT: addq 
$1480, %rsp # imm = 0x5C8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $136, %rsp +; AVX512BW-NEXT: subq $72, %rsp ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 @@ -15224,7 +15188,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15238,7 +15202,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -15252,7 +15216,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 @@ -15281,7 +15244,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 @@ -15392,10 +15356,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 @@ -15416,13 +15380,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512BW-NEXT: addq $136, %rsp +; AVX512BW-NEXT: addq $72, %rsp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i16_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $136, %rsp +; AVX512BW-FCP-NEXT: subq $72, %rsp ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 @@ -15450,7 +15414,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15464,7 +15428,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -15478,7 +15442,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 @@ -15507,7 +15470,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512BW-FCP-NEXT: vpermt2w 
%zmm25, %zmm2, %zmm30 @@ -15618,10 +15582,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 @@ -15642,13 +15606,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512BW-FCP-NEXT: addq $136, %rsp +; AVX512BW-FCP-NEXT: addq $72, %rsp ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i16_stride7_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $136, %rsp +; AVX512DQ-BW-NEXT: subq $72, %rsp ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29 @@ -15676,7 +15640,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15690,7 +15654,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -15704,7 +15668,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 @@ -15733,7 +15696,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 @@ -15844,10 +15808,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 @@ -15868,13 +15832,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512DQ-BW-NEXT: addq $136, %rsp +; AVX512DQ-BW-NEXT: addq $72, %rsp ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i16_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $136, %rsp +; AVX512DQ-BW-FCP-NEXT: subq $72, %rsp ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29 @@ -15902,7 +15866,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15916,7 +15880,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -15930,7 +15894,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 @@ -15959,7 +15922,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 @@ -16070,10 +16034,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 @@ -16094,7 +16058,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $136, %rsp +; AVX512DQ-BW-FCP-NEXT: addq $72, %rsp ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 9c9dca82f60ca..4916bd88cf51a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -636,8 +636,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15] @@ -646,8 +645,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 @@ -723,8 +721,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15] @@ -733,8 +730,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 89330122fa239..f86e66f2aeb92 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -1840,53 +1840,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; 
AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -1906,53 +1896,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; 
AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, 
%zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -1972,53 +1952,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-NEXT: vpermi2d %zmm2, 
%zmm3, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -2038,53 +2008,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -2104,53 +2064,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd 
{{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -2170,53 +2120,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -2236,53 +2176,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: movw 
$12684, %ax # imm = 0x318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -2302,53 +2232,43 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 
{{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] @@ -3907,12 +3827,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512-NEXT: movw $12684, %ax # imm = 0x318C @@ -3920,12 +3838,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512-NEXT: movw $25368, %ax # imm = 0x6318 @@ -3933,34 +3849,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; 
AVX512-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -4013,12 +3923,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512-FCP-NEXT: movw $12684, %ax # imm = 0x318C @@ -4026,12 +3934,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512-FCP-NEXT: movw $25368, %ax # imm = 0x6318 @@ -4039,34 +3945,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -4119,12 +4019,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512DQ-NEXT: movw $12684, %ax # imm = 0x318C @@ -4132,12 +4030,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm7, 
%zmm3 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512DQ-NEXT: movw $25368, %ax # imm = 0x6318 @@ -4145,34 +4041,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -4225,12 +4115,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 
64(%rcx), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: movw $12684, %ax # imm = 0x318C @@ -4238,12 +4126,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: movw $25368, %ax # imm = 0x6318 @@ -4251,34 +4137,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = 
[6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -4331,12 +4211,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C @@ -4344,12 +4222,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 @@ -4357,34 +4233,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -4437,12 +4307,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C @@ -4450,12 +4318,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; 
AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 @@ -4463,34 +4329,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -4543,12 +4403,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd 
{{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: movw $12684, %ax # imm = 0x318C @@ -4556,12 +4414,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: movw $25368, %ax # imm = 0x6318 @@ -4569,34 +4425,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-BW-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -4649,12 +4499,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C @@ -4662,12 +4510,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 @@ -4675,34 +4521,28 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; 
AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 @@ -7996,213 +7836,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-LABEL: store_i32_stride5_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 
-; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vpermt2d %zmm18, %zmm14, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm0, %zmm28, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; 
AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512-NEXT: vpermt2d %zmm20, %zmm8, %zmm21 +; AVX512-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512-NEXT: 
vmovdqa64 %zmm0, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm8 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-NEXT: kmovw %eax, %k3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; 
AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm22 +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm10 ; AVX512-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512-NEXT: vmovdqa64 
%zmm3, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -8210,213 +8039,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-LABEL: store_i32_stride5_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm14, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 ; 
AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm21 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-FCP-NEXT: kmovw %eax, %k3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm25, 
%zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, 
%zmm10 ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -8424,213 +8242,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-LABEL: store_i32_stride5_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm14, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm28, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-NEXT: vpermt2d 
%zmm24, %zmm30, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm8, %zmm21 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-NEXT: 
vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512DQ-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512DQ-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512DQ-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-NEXT: vpermt2d 
%zmm2, %zmm16, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512DQ-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: 
vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm10 ; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -8638,213 +8445,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-LABEL: store_i32_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 
{{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm29, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm2, 
%zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -8852,213 +8648,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-LABEL: store_i32_stride5_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: 
subq $712, %rsp # imm = 0x2C8 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, 
%zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm20, 
%zmm8, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 
%zmm1, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; 
AVX512BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -9066,213 +8851,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-LABEL: store_i32_stride5_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: 
subq $712, %rsp # imm = 0x2C8 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 
-; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill 
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, 
%zmm15, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 576(%r9) -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512BW-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -9280,213 +9054,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-LABEL: store_i32_stride5_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, 
%zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = 
[8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; 
AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512DQ-BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbd 
{{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: 
vpermt2d %zmm2, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -9494,213 +9257,202 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-LABEL: store_i32_stride5_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,1,17,0,0,0,2,18,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,16,0,0,0,1,17,0,0,0,2,18,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,13,29,0,0,0,14,30,0,0,0,15,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,13,29,0,0,0,14,30,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,10,26,0,0,0,11,27,0,0,0,12,28,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [9,0,0,0,26,10,0,0,0,27,11,0,0,0,28,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm10, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [6,22,0,0,0,7,23,0,0,0,8,24,0,0,0,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,3,19,0,0,0,4,20,0,0,0,5,21,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, 
%zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,7,23,0,0,0,8,24,0,0,0,9,25,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,0,0,0,20,4,0,0,0,21,5,0,0,0,22,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm15, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, 
%zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm8, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm15, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm25, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm21 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm17 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm3 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 448(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 640(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 768(%r9) -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 768(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 832(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 960(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 1088(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 1216(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-BW-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 78b07e5671e5a..cb7101cebe04d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -2198,8 +2198,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] @@ -2213,19 +2212,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 ; AVX512-NEXT: movb $-110, %cl ; AVX512-NEXT: kmovw %ecx, %k2 @@ -2234,19 +2230,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 
= [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] @@ -2255,8 +2248,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] @@ -2282,11 +2274,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512-FCP-NEXT: movb $-110, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 @@ -2297,8 +2287,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512-FCP-NEXT: movb $36, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -2310,19 +2299,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] @@ -2331,8 +2317,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] @@ -2342,8 +2327,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] @@ -2368,8 +2352,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] @@ -2383,19 +2366,16 @@ define void 
@store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 ; AVX512DQ-NEXT: movb $-110, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 @@ -2404,19 +2384,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] @@ -2425,8 +2402,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512DQ-NEXT: 
vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] @@ -2452,11 +2428,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: movb $-110, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 @@ -2467,8 +2441,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: movb $36, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -2480,19 +2453,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] @@ -2501,8 +2471,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] @@ -2512,8 +2481,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] @@ -2538,8 +2506,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] @@ -2553,19 +2520,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: movb $-110, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -2574,19 +2538,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512BW-NEXT: 
vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] @@ -2595,8 +2556,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] @@ -2622,11 +2582,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: movb $-110, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -2637,8 +2595,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: movb $36, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -2650,19 +2607,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] @@ -2671,8 +2625,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] @@ -2682,8 +2635,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] @@ -2708,8 +2660,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] @@ -2723,19 +2674,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: movb $-110, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -2744,19 +2692,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] @@ -2765,8 +2710,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] @@ -2792,11 +2736,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $-110, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -2807,8 +2749,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $36, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -2820,19 +2761,16 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] @@ -2841,8 +2779,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] @@ -2852,8 +2789,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] @@ -4768,18 +4704,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 ; AVX512-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 @@ -4811,8 +4743,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d 
%zmm8, %zmm20, %zmm11 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512-NEXT: movb $-110, %cl @@ -4822,8 +4753,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} @@ -4843,8 +4773,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} ; AVX512-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 ; AVX512-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm21 @@ -4860,8 +4789,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 ; AVX512-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] @@ -4898,39 +4826,32 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 @@ -4948,8 +4869,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 @@ -5043,18 +4963,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; 
AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 ; AVX512DQ-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 @@ -5086,8 +5002,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-NEXT: movb $-110, %cl @@ -5097,8 +5012,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} @@ -5118,8 +5032,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm21 @@ -5135,8 +5048,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] @@ -5173,39 +5085,32 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm26 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 @@ -5223,8 +5128,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 @@ -5318,18 +5222,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 @@ -5361,8 +5261,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-NEXT: movb $-110, %cl @@ -5372,8 +5271,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} @@ -5393,8 +5291,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm21 @@ -5410,8 +5307,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = 
[13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] @@ -5448,39 +5344,32 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 @@ -5498,8 +5387,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: 
vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 @@ -5593,18 +5481,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 @@ -5636,8 +5520,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: movb $-110, %cl @@ -5647,8 +5530,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} @@ -5668,8 +5550,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, 
%zmm26, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm21 @@ -5685,8 +5566,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] @@ -5723,39 +5603,32 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 @@ -5773,8 +5646,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 @@ -9830,18 +9702,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm16 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] @@ -9864,12 +9733,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm19 ; AVX512-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 @@ -9918,22 +9785,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm6 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 ; AVX512-NEXT: movb $-110, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} @@ -10080,7 +9944,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-LABEL: store_i32_stride6_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -10088,43 +9952,36 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm19, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10132,14 +9989,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 
%zmm5, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -10150,231 +10007,226 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm13, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm24 ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm3, %zmm18 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm31, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512-FCP-NEXT: movb $-110, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512-FCP-NEXT: movb $36, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 
{%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm31 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 
{%k1} ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm27 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm17 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpermt2d 
%zmm0, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm20 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm24 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 -; 
AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm30 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 1344(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 896(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 832(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 576(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512-FCP-NEXT: addq $1160, %rsp # imm = 0x488 ; 
AVX512-FCP-NEXT: vzeroupper @@ -10397,18 +10249,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] @@ -10431,12 +10280,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 @@ -10485,22 +10332,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 ; AVX512DQ-NEXT: movb $-110, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] ; 
AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} @@ -10647,7 +10491,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-LABEL: store_i32_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -10655,43 +10499,36 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm19, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10699,14 +10536,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -10717,231 +10554,226 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm13, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, 
%zmm9, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm24 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm3, %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm31, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 
128(%rdx), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: movb $-110, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: movb $36, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm31 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm17 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, 
%zmm9, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm3 +; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 1344(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 832(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 576(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-FCP-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512DQ-FCP-NEXT: vzeroupper @@ -10964,18 +10796,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; 
AVX512BW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] @@ -10998,12 +10827,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 @@ -11052,22 +10879,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 ; AVX512BW-NEXT: movb $-110, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} @@ -11214,7 +11038,7 @@ define void @store_i32_stride6_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-LABEL: store_i32_stride6_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -11222,43 +11046,36 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm19, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 
%zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11266,14 +11083,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -11284,231 +11101,226 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm13, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm24 ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, 
%zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm3, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm31, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, 
%zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512BW-FCP-NEXT: movb $-110, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: movb $36, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm31 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, 
%zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 1344(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 896(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 832(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 576(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-FCP-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512BW-FCP-NEXT: vzeroupper @@ -11531,18 +11343,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] @@ -11565,12 +11374,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 @@ -11619,22 +11426,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 ; AVX512DQ-BW-NEXT: movb $-110, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} @@ -11781,7 +11585,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
128(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -11789,43 +11593,36 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,16,0,0,0,0,1,17,0,0,0,0,2,18] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm19, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,16,0,0,0,0,1,17,0,0,0,0,2,18,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,3,19,0,0,0,0,4,20,0,0,0,0,5,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,24,0,0,0,0,9,25,0,0,0,0,10,26,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = 
[0,0,11,27,0,0,0,0,12,28,0,0,0,0,13,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11833,14 +11630,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -11851,231 +11648,226 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm13, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d 
%zmm23, %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [5,21,0,0,0,0,6,22,0,0,0,0,7,23,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,8,24,0,0,0,0,9,25,0,0,0,0,10,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [13,29,0,0,0,0,14,30,0,0,0,0,15,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm31, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $-110, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $36, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm17 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm31 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1344(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 1344(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1280(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 896(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 832(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 704(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 576(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-BW-FCP-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512DQ-BW-FCP-NEXT: 
vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index f41123c5c3cfd..0199177a94b6b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -1033,8 +1033,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1043,8 +1042,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -1092,8 +1090,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1102,8 +1099,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -1151,8 +1147,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1161,8 +1156,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-NEXT: 
vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -1210,8 +1204,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1220,8 +1213,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -1269,8 +1261,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1279,8 +1270,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -1328,8 +1318,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = 
[0,5,13,0,0,0,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1338,8 +1327,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -1387,8 +1375,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1397,8 +1384,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -1446,8 +1432,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 @@ -1456,8 +1441,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,7,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 @@ -2343,8 +2327,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; 
AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -2355,8 +2338,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -2442,8 +2424,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -2454,8 +2435,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -2541,8 +2521,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -2553,8 +2532,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -2640,8 +2618,7 @@ define void 
@store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -2652,8 +2629,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -2739,8 +2715,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -2751,8 +2726,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -2838,8 +2812,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -2850,8 +2823,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm20 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -2937,8 +2909,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -2949,8 +2920,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -3036,8 +3006,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,7,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] @@ -3048,8 +3017,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] @@ -4986,16 +4954,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5188,16 +5154,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5390,16 +5354,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5592,16 +5554,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5794,16 +5754,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5996,16 +5954,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6198,16 +6154,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6400,16 +6354,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,7,15,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10527,128 +10479,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; 
AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: 
vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: 
vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -10660,8 +10611,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] @@ -10990,128 +10940,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, 
%zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -11123,8 +11072,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] @@ -11453,128 +11401,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: 
vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; 
AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -11586,8 +11533,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] @@ -11916,128 +11862,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -12049,8 +11994,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] @@ -12379,128 +12323,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # 
zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; 
AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -12512,8 +12455,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] @@ -12842,128 +12784,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -12975,8 +12916,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] @@ -13305,128 +13245,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, 
%zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -13438,8 +13377,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] @@ -13768,128 +13706,127 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 -; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] @@ -13901,8 +13838,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,5,13,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index aac6a1bddd08a..fc6cc82ebdad4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -544,8 +544,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -572,8 +571,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -600,8 +598,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] 
-; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -628,8 +625,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -656,8 +652,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -684,8 +679,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -712,8 +706,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -740,8 +733,7 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,3,7,11,15,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 @@ -1228,672 +1220,616 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-LABEL: 
store_i64_stride6_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: movb $12, %r10b ; AVX512-NEXT: kmovw %r10d, %k1 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512-NEXT: movb $16, %r10b ; AVX512-NEXT: kmovw %r10d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: movb $48, %r9b ; AVX512-NEXT: kmovw %r9d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride6_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512-FCP-NEXT: 
vmovdqa64 (%rcx), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512-FCP-NEXT: movb $12, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512-FCP-NEXT: movb $16, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: movb $48, %r9b ; AVX512-FCP-NEXT: kmovw %r9d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, 
%zmm8 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride6_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: 
vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-NEXT: movb $12, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-NEXT: movb $16, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: movb $48, %r9b ; AVX512DQ-NEXT: kmovw %r9d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 
= zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: movb $12, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: movb $16, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: movb $48, %r9b ; AVX512DQ-FCP-NEXT: kmovw %r9d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, 
%zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm10 
= mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride6_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; 
AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: movb $12, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: movb $16, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: movb $48, %r9b ; AVX512BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 
64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride6_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: movb $12, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: movb $16, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: movb $48, %r9b ; AVX512DQ-BW-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq 
{{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm5, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride6_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $12, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $16, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $48, %r9b ; AVX512DQ-BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,9,0,4,5,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm6 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -2968,20 +2904,19 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-LABEL: store_i64_stride6_vf16: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -2991,135 +2926,127 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $16, %r10b ; AVX512-NEXT: kmovw %r10d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = 
zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512-NEXT: movb $48, %r9b ; AVX512-NEXT: kmovw %r9d, %k2 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512-NEXT: vpermt2q 
%zmm11, %zmm17, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: 
vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride6_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 
64(%r8), %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -3129,135 +3056,127 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $16, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512-FCP-NEXT: movb $48, %r9b ; AVX512-FCP-NEXT: kmovw %r9d, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512-FCP-NEXT: vpermt2q 
%zmm10, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride6_vf16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -3267,135 +3186,127 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $16, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = 
[0,14,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512DQ-NEXT: movb $48, %r9b ; AVX512DQ-NEXT: kmovw %r9d, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 
%zmm22, %zmm15 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQ-NEXT: 
vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride6_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512DQ-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -3405,135 +3316,127 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $16, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512DQ-FCP-NEXT: movb $48, %r9b ; AVX512DQ-FCP-NEXT: kmovw %r9d, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} 
zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -3543,135 +3446,127 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512BW-NEXT: 
vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, 
%zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] 
+; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride6_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-FCP-NEXT: 
vmovdqa64 (%rsi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -3681,135 +3576,127 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $16, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512BW-FCP-NEXT: movb $48, %r9b ; AVX512BW-FCP-NEXT: kmovd %r9d, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 
%zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q 
%zmm10, %zmm23, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride6_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -3819,135 +3706,127 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $16, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512DQ-BW-NEXT: movb $48, %r9b ; AVX512DQ-BW-NEXT: kmovd %r9d, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,12,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 @@ -3957,115 +3836,108 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $16, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # 
zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $48, %r9b ; AVX512DQ-BW-FCP-NEXT: kmovd %r9d, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 
256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -6364,2217 +6236,2153 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride6_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm29 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 
%zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512-NEXT: 
vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; 
AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $12, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512-NEXT: movb $48, %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = 
[0,1,9,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512-NEXT: vpermt2q 
%zmm1, %zmm0, %zmm31 +; AVX512-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: movb $16, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm23, 
%zmm11 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 -; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512-NEXT: 
vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 1152(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride6_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; 
AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512-FCP-NEXT: 
vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $12, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512-FCP-NEXT: movb $48, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm14, 
%zmm5, %zmm29 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; 
AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: movb $16, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm29 = 
zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 
1216(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride6_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512DQ-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4 +; 
AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; 
AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $12, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-NEXT: movb $48, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; 
AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512DQ-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: 
vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: movb $16, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm8, 
%zmm19, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 
%zmm16, 1152(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512DQ-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 -; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $12, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: movb $48, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 
{%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, 
%ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: movb $16, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 
1216(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512DQ-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; 
AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm14, 
%zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermi2q 
%zmm25, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $12, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-NEXT: movb $48, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdx), 
%xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: movb $16, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q 
%zmm30, %zmm21, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; 
AVX512BW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride6_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm14 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512BW-FCP-NEXT: 
vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $12, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: movb $48, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, 
%ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 
$0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: movb $16, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 
64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm22, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512BW-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride6_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm29 
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm23, %zmm14, %zmm30 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $12, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: movb $48, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: 
vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $16, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 
$0, %ymm1, %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 1152(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512DQ-BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride6_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [4,12,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 
%zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $12, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $48, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} 
ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: movb $16, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 
# 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1408(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 960(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512DQ-BW-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -13389,5545 +13197,5329 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride6_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm28 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm27 
+; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 -; 
AVX512-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q 
%zmm27, %zmm3, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 +; AVX512-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512-NEXT: 
vmovdqa64 192(%rsi), %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q 
%zmm13, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $12, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} ; AVX512-NEXT: movb $48, %al ; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: movb $16, %al +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; 
AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0, %zmm20 # 64-byte Folded Reload +; AVX512-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%r9), %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512-NEXT: movb $16, %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%r9), 
%zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%r9), %zmm11 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%r9), %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%r9), %zmm22 -; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512-NEXT: 
vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa 256(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, 
%xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; 
AVX512-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, (%rax) +; AVX512-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride6_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, 
%zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # 
zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512-FCP-NEXT: 
vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $12, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} ; AVX512-FCP-NEXT: movb $48, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512-FCP-NEXT: vpermt2q 
%zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512-FCP-NEXT: movb $16, %al -; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: movb $16, %al +; AVX512-FCP-NEXT: kmovw %eax, %k2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 
{%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; 
AVX512-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; 
AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride6_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512DQ-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-NEXT: 
vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: 
vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q 
%zmm27, %zmm3, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512DQ-NEXT: 
vmovdqa64 384(%rcx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 +; AVX512DQ-NEXT: 
vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: movb $12, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-NEXT: movb $48, %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; 
AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: movb $12, %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; 
AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512DQ-NEXT: movb $16, %al +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQ-NEXT: movb $48, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 
%zmm3, %zmm27 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} +; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 
; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: movb $16, %al +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = 
ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
(%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa 256(%rdx), 
%xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, 
%zmm15 -; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQ-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQ-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQ-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; 
AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 
704(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = 
[6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, 
%zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $12, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: movb $48, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: movb $16, %al +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: movb $48, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: movb $16, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 
$1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: 
vmovaps %zmm5, 2176(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQ-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 
128(%rdx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 
%zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512BW-NEXT: 
vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512BW-NEXT: 
vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 +; 
AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $12, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-NEXT: movb $48, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = 
zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload 
-; AVX512BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512BW-NEXT: movb $16, %al +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512BW-NEXT: movb $48, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, 
%zmm15 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: movb $16, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; 
AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, 
%zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512BW-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512BW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride6_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), 
%zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: movb $12, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 
{%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: movb $48, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q 
%zmm9, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-FCP-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, 
%zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: movb $12, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: movb $16, %al +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: movb $48, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 
448(%r8), %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; 
AVX512BW-FCP-NEXT: movb $16, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, 
%zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, 
%xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} 
xmm10 = xmm10[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512BW-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512BW-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride6_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm2 -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 
%zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: movb $12, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: movb $48, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, 
%zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm6, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: movb $16, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q 
%zmm11, %zmm4, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: movb $12, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: movb $48, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 
# 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 
%zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: movb $16, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: 
vmovdqa64 448(%r9), %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: 
vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; 
AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-BW-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,9,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [5,13,0,0,0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = 
[0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, 
%zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [4,12,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm23 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $12, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: movb $48, %al -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 
64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm0, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: movb $16, %al +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: movb $48, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, 
%zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: movb $16, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm11, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; 
AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 
%xmm10, %ymm0, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 3008(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2624(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2240(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 1856(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1472(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 1088(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm15, 2304(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index d1fd4a360036b..3b7945af08147 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -627,24 +627,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512-NEXT: movb $112, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -672,24 +669,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512-FCP-NEXT: 
vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512-FCP-NEXT: movb $112, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512-FCP-NEXT: movb $56, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -717,24 +711,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-NEXT: movb $112, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512DQ-NEXT: movb $56, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -762,24 +753,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: movb $112, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; 
AVX512DQ-FCP-NEXT: movb $56, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -807,24 +795,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512BW-NEXT: movb $112, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -852,24 +837,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: movb $112, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: movb $56, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -897,24 
+879,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: movb $112, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: movb $56, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -942,24 +921,21 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movb $112, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,5,9,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $56, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,2,6,10,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 @@ -1548,122 +1524,108 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 -; 
AVX512-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512-NEXT: vmovdqa64 (%r10), %zmm1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm7, %zmm8, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512-NEXT: vpermt2q %zmm7, %zmm3, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm9 -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 -; AVX512-NEXT: movb $48, %sil -; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm5[0],zmm6[0],zmm5[2],zmm6[2],zmm5[4],zmm6[4],zmm5[6],zmm6[6] -; AVX512-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512-NEXT: vmovdqa64 (%r10), %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,15,0] +; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,7,15,0,0,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,7,15,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: movb $24, %sil ; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512-NEXT: movb $96, %sil ; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,0,0,0,1] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512-NEXT: movb $12, %sil ; AVX512-NEXT: kmovw %esi, 
%k2 -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512-NEXT: vinserti32x4 $3, (%r10), %zmm5, %zmm5 +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm5 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 +; AVX512-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 ; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 {%k2} -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,0,0,0,0,14,6] +; AVX512-NEXT: vpermi2q %zmm7, %zmm8, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,13,0,0,0,0,6,7] +; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,6,14] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] ; AVX512-NEXT: movb $-61, %sil ; AVX512-NEXT: kmovw %esi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa (%r9), %ymm6 -; AVX512-NEXT: vmovdqa (%r8), %ymm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,3,11,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,0,0,0,3] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512-NEXT: vmovdqa (%r9), %ymm11 +; AVX512-NEXT: vmovdqa (%r8), %ymm12 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512-NEXT: movb $28, %sil ; AVX512-NEXT: kmovw %esi, %k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] -; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm13[2,3,2,3],zmm6[2,3,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [1,0,0,0,0,0,10,2] +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] ; AVX512-NEXT: movb $6, %cl ; AVX512-NEXT: kmovw %ecx, %k2 -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,3,4,9,0,0] 
+; AVX512-NEXT: vpermi2q %zmm6, %zmm14, %zmm15 ; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k2 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,5,13,0] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,5,13,0,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [4,12,0,0,0,0,0,5] +; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,12,0,0,0,0,7] +; AVX512-NEXT: vpermi2q %zmm6, %zmm14, %zmm7 ; AVX512-NEXT: movb $120, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,0,0,12,4] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512-NEXT: movb $48, %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: movb $14, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1671,65 +1633,58 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,7,15,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,7,15,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm9 ; AVX512-FCP-NEXT: movb $24, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-FCP-NEXT: movb $96, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,8,0,0,0,0,0,1] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512-FCP-NEXT: movb $12, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 ; AVX512-FCP-NEXT: movb $112, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,0,0,0,0,14,6] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,13,0,0,0,0,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] ; AVX512-FCP-NEXT: movb $-61, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 
%zmm9 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [3,0,0,0,0,0,12,4] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 ; AVX512-FCP-NEXT: movb $48, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm12 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,3,7,0] @@ -1738,55 +1693,48 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movb $14, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm10 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm4, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,3,11,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,10,0,0,0,0,0,3] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512-FCP-NEXT: movb $28, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm0[2,3,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [1,0,0,0,0,0,10,2] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512-FCP-NEXT: movb $6, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,3,4,9,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm13 ; AVX512-FCP-NEXT: movb $56, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; 
AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,5,13,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,5,13,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,0,0,0,5] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,0,0,0,0,7] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: movb $120, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1794,120 +1742,107 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,0,0,0,0,0,14,6] +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,0,0,0,0,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm2, 
%zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-NEXT: movb $-61, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,3,11,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,0,0,0,3] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 ; AVX512DQ-NEXT: movb $96, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm8 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX512DQ-NEXT: movb $28, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm11 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 -; AVX512DQ-NEXT: movb $48, %sil -; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm11[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,15,0] +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,7,15,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,7,15,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-NEXT: movb $24, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm13, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} 
zmm12 = [1,0,0,0,0,0,10,2] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-NEXT: movb $6, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,1,9,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,3,4,9,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm13, %zmm14 ; AVX512DQ-NEXT: movb $56, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm12 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,0,0,0,1] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-NEXT: movb $12, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 ; AVX512DQ-NEXT: movb $112, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 -; AVX512DQ-NEXT: vinserti64x2 $3, (%r10), %zmm4, %zmm6 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-NEXT: vinserti64x2 $3, (%r10), %zmm14, %zmm13 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,5,13,0] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,5,13,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [4,12,0,0,0,0,0,5] +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,12,0,0,0,0,7] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm14, %zmm7 ; AVX512DQ-NEXT: movb $120, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,0,0,12,4] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, 
%zmm6 +; AVX512DQ-NEXT: movb $48, %cl +; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: movb $14, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1915,34 +1850,30 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [5,0,0,0,0,0,14,6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,0,0,0,0,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm9[0,1,2,3],zmm4[4,5,6,7] ; AVX512DQ-FCP-NEXT: movb $-61, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] -; 
AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,0,0,12,4] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: movb $48, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,3,7,0] @@ -1951,85 +1882,75 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movb $14, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,3,11,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,10,0,0,0,0,0,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 ; AVX512DQ-FCP-NEXT: movb $96, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQ-FCP-NEXT: movb $28, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm12 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,7,15,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,7,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,7,15,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: movb $24, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [1,0,0,0,0,0,10,2] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] 
; AVX512DQ-FCP-NEXT: movb $6, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,3,4,9,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: movb $56, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,0,0,0,0,1] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: movb $12, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: movb $112, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,5,13,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,5,13,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,0,0,0,5] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,0,0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: movb $120, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm9, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2037,122 +1958,108 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 -; AVX512BW-NEXT: movb $48, %sil -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm5[0],zmm6[0],zmm5[2],zmm6[2],zmm5[4],zmm6[4],zmm5[6],zmm6[6] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,7,15,0] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,7,15,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,7,15,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: movb $24, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512BW-NEXT: movb $96, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: 
vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,0,0,0,1] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-NEXT: movb $12, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vinserti32x4 $3, (%r10), %zmm5, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 +; AVX512BW-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 ; AVX512BW-NEXT: movb $112, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k2} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,0,0,0,0,14,6] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,13,0,0,0,0,6,7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,6,14] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-NEXT: movb $-61, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%r9), %ymm6 -; AVX512BW-NEXT: vmovdqa (%r8), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,3,11,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,0,0,0,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa (%r9), %ymm11 +; AVX512BW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512BW-NEXT: movb $28, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm13[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [1,0,0,0,0,0,10,2] +; 
AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-NEXT: movb $6, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,3,4,9,0,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm15 ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,5,13,0] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,5,13,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [4,12,0,0,0,0,0,5] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,12,0,0,0,0,7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm7 ; AVX512BW-NEXT: movb $120, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,0,0,12,4] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512BW-NEXT: movb $48, %cl +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: movb $14, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; 
AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2160,65 +2067,58 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,7,15,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,7,15,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: movb $24, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: movb $96, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,8,0,0,0,0,0,1] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-FCP-NEXT: movb $12, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 ; AVX512BW-FCP-NEXT: movb $112, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, 
%zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,0,0,0,0,14,6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,13,0,0,0,0,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FCP-NEXT: movb $-61, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [3,0,0,0,0,0,12,4] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 ; AVX512BW-FCP-NEXT: movb $48, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm12 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,3,7,0] @@ -2227,55 +2127,48 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movb $14, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,3,11,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,10,0,0,0,0,0,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512BW-FCP-NEXT: movb $28, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [1,0,0,0,0,0,10,2] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: 
vpbroadcastq 8(%rcx), %ymm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-FCP-NEXT: movb $6, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,3,4,9,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: movb $56, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,0,5,13,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,5,13,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,0,0,0,5] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,0,0,0,0,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: movb $120, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2283,120 +2176,107 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $-61, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,0,0,0,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: movb $96, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm8 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX512DQ-BW-NEXT: movb $28, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 -; AVX512DQ-BW-NEXT: movb $48, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm11[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,7,15,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: movb $24, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm13, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $6, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: movb $56, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-NEXT: movb $12, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 ; AVX512DQ-BW-NEXT: movb $112, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%r10), %zmm4, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%r10), %zmm14, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,5,13,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm7 ; AVX512DQ-BW-NEXT: movb $120, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: movb $48, %cl +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: movb $14, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2404,34 +2284,30 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
(%rsi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm9[0,1,2,3],zmm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: movb $-61, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $48, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,3,7,0] @@ -2440,85 +2316,75 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movb $14, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = 
[0,0,0,0,0,3,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,10,0,0,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $96, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQ-BW-FCP-NEXT: movb $28, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movb $24, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: movb $6, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movb $56, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), 
%xmm13 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-BW-FCP-NEXT: movb $12, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movb $112, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,5,13,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $120, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -3775,823 +3641,763 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-LABEL: store_i64_stride7_vf16: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm11 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512-NEXT: vpermt2q 
%zmm2, %zmm16, %zmm19 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 -; AVX512-NEXT: vpermi2q %zmm9, %zmm18, %zmm5 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm23 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512-NEXT: vpermt2q %zmm18, %zmm17, %zmm14 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 -; AVX512-NEXT: movb $48, %sil -; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] -; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm7 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm16 = [0,7,15,0] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 +; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] ; AVX512-NEXT: movb $64, %sil ; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm31 -; AVX512-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512-NEXT: vpermt2q %zmm9, %zmm8, %zmm22 -; AVX512-NEXT: movb $24, %sil -; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] -; AVX512-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] -; AVX512-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] -; 
AVX512-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512-NEXT: vpermt2q %zmm30, %zmm26, %zmm27 -; AVX512-NEXT: vpermi2q %zmm31, %zmm3, %zmm26 -; AVX512-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm22 +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,0,0,0,0,1] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 +; AVX512-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 ; AVX512-NEXT: movb $96, %sil +; AVX512-NEXT: kmovw %esi, %k1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [4,12,0,0,0,0,0,5] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,7,15,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512-NEXT: movb $24, %sil ; AVX512-NEXT: kmovw %esi, %k2 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512-NEXT: vpermi2q %zmm1, %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm11, %zmm29, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512-NEXT: vpermt2q %zmm31, %zmm21, %zmm11 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm9, %zmm29, %zmm23 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,12,7,0,1,12,7] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm12 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 -; AVX512-NEXT: vmovdqa64 64(%r9), %ymm31 -; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 -; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm17 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 -; AVX512-NEXT: vpermi2q %zmm30, %zmm20, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm20, %zmm30, %zmm7 -; AVX512-NEXT: vpermi2q %zmm30, %zmm20, %zmm21 -; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm20 -; AVX512-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm23[0],ymm31[0],ymm23[2],ymm31[2] +; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,2,3,4,15,0,0] +; AVX512-NEXT: vpermi2q %zmm11, %zmm16, %zmm26 +; AVX512-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 +; 
AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,12,0,0,0,0,7] +; AVX512-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,3,11,0] +; AVX512-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,10,0,0,0,0,0,3] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,9,0,0,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,7,15,0] +; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,0,0,0,0,0,14,6] +; AVX512-NEXT: vpermi2q %zmm14, %zmm25, %zmm1 +; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 +; AVX512-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512-NEXT: vmovdqa 64(%r9), %ymm15 +; AVX512-NEXT: vpermt2q %zmm24, %zmm27, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,15,0] +; AVX512-NEXT: vpermi2q %zmm24, %zmm26, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [13,0,2,3,4,5,6,14] +; AVX512-NEXT: vpermi2q %zmm24, %zmm23, %zmm26 +; AVX512-NEXT: vmovdqa64 64(%r8), %ymm24 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] ; AVX512-NEXT: movb $28, %sil ; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm8, %zmm28, %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm28, %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm28, %zmm9, %zmm21 -; AVX512-NEXT: vmovdqa (%r9), %ymm7 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm13 {%k2} -; AVX512-NEXT: vmovdqa (%r8), %ymm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k3} = zmm29[2,3,2,3],zmm28[2,3,2,3] -; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm20 -; AVX512-NEXT: vmovdqa64 (%rdx), %xmm28 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm28[0],mem[0] -; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm28 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [6,0,0,0,0,13,14,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm22, %zmm28 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,13,0,0,0,0,6,7] +; AVX512-NEXT: vpermi2q %zmm22, %zmm1, %zmm27 +; AVX512-NEXT: vmovdqa (%r9), %ymm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 +; AVX512-NEXT: vmovdqa (%r8), %ymm1 +; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,3,4,9,0,0] +; AVX512-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,5,13,0] +; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm30, 
%zmm26, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [3,0,0,0,0,0,12,4] +; AVX512-NEXT: vpermt2q %zmm9, %zmm26, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm9 +; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vpermi2q %zmm2, %zmm8, %zmm31 +; AVX512-NEXT: vpermi2q %zmm2, %zmm8, %zmm29 +; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm30 = [1,0,0,0,0,0,10,2] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 +; AVX512-NEXT: movb $48, %sil +; AVX512-NEXT: kmovw %esi, %k3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,5,13,0,0,0] +; AVX512-NEXT: vpermt2q %zmm7, %zmm26, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,6,14] +; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm26 +; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm30 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: movb $12, %sil ; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3} -; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm27, %zmm27 +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k3} +; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 ; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 {%k4} -; AVX512-NEXT: vmovdqa64 64(%rdx), %xmm27 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] -; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 -; AVX512-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm5 {%k3} -; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm26, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm5 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 {%k4} +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} +; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512-NEXT: movb $120, %sil ; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 {%k3} -; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512-NEXT: movb $6, %sil ; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4} +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k4} ; AVX512-NEXT: movb $56, %sil ; AVX512-NEXT: kmovw %esi, %k5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 {%k5} -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm8 {%k5} +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512-NEXT: movb $-31, %sil -; AVX512-NEXT: kmovw %esi, %k1 -; 
AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: kmovw %esi, %k2 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,2,3],zmm31[4,5,6,7] ; AVX512-NEXT: movb $-61, %sil -; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 {%k5} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: kmovw %esi, %k2 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} +; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm30 {%k4} +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm30 {%k5} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm15[1],ymm24[3],ymm15[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: movb $14, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 576(%rax) ; AVX512-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride7_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7] -; AVX512-FCP-NEXT: movb $64, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm12 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512-FCP-NEXT: movb $24, %sil +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,0,0,0,0,1] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %xmm16 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-FCP-NEXT: movb $12, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,15,0,0] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] -; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q 
%zmm12, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm26, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512-FCP-NEXT: movb $48, %sil +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm22 +; AVX512-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512-FCP-NEXT: movb $112, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm29 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm28 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm30 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm15, %zmm20 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm18[0],zmm2[0],zmm18[2],zmm2[2],zmm18[4],zmm2[4],zmm18[6],zmm2[6] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %ymm23 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm21 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; 
AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,3,7,7] -; AVX512-FCP-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm14 +; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm14, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,5,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm27 ; AVX512-FCP-NEXT: movb $96, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm18 -; AVX512-FCP-NEXT: movb $28, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k3} = zmm5[2,3,2,3],zmm12[2,3,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm12 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm29 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm31, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %ymm27 -; AVX512-FCP-NEXT: vpermi2q %ymm23, %ymm27, %ymm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm27[0],ymm23[0],ymm27[2],ymm23[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k3} = zmm23[2,3,2,3],zmm12[2,3,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: movb $12, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k3} -; AVX512-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm5 -; AVX512-FCP-NEXT: movb $112, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k4} -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k3} -; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm7, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 
%zmm10 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k2} +; AVX512-FCP-NEXT: kmovw %esi, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm27 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,0,0,0,0,0,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,12,0,0,0,0,7] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm14 ; AVX512-FCP-NEXT: movb $120, %sil +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [6,0,0,0,0,13,14,7] +; AVX512-FCP-NEXT: vpermi2q %zmm22, %zmm13, %zmm29 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm30 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,7,15,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm22 +; AVX512-FCP-NEXT: movb $24, %dil +; AVX512-FCP-NEXT: kmovw %edi, %k2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 {%k2} +; AVX512-FCP-NEXT: movb $-31, %dil +; AVX512-FCP-NEXT: kmovw %edi, %k3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [5,0,0,0,0,0,14,6] +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm15, %zmm29 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,13,0,0,0,0,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm29, %zmm30 +; AVX512-FCP-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512-FCP-NEXT: movb $-61, %dil +; AVX512-FCP-NEXT: kmovw %edi, %k3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} ; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm14 {%k3} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [3,0,0,0,0,0,12,4] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm21 +; AVX512-FCP-NEXT: movb $48, %sil +; AVX512-FCP-NEXT: vpermi2q %zmm17, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k3} +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,3,7,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512-FCP-NEXT: vpermt2q %ymm26, %ymm6, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: movb $14, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} -; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: movb $6, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5} -; AVX512-FCP-NEXT: movb $56, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k6} -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} -; AVX512-FCP-NEXT: movb $-31, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7] -; AVX512-FCP-NEXT: movb $-61, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k1 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm27 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512-FCP-NEXT: vpermi2q %ymm25, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k3} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm23[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,3,11,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [2,10,0,0,0,0,0,3] +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512-FCP-NEXT: movb $28, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm19[2,3,2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,0,0,0,0,0,10,2] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm25[0],ymm8[2],ymm25[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm13[2,3,2,3] +; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: movb $6, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,3,4,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512-FCP-NEXT: movb $64, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512-FCP-NEXT: movb $56, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k3} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} -; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 
{%k5} -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k6} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [13,0,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm5, %zmm4 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride7_vf16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 -; 
AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 -; AVX512DQ-NEXT: movb $48, %sil -; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm17, %zmm18 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,15,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-NEXT: movb $64, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 -; AVX512DQ-NEXT: movb $24, %sil -; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] -; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm26 -; 
AVX512DQ-NEXT: vpermt2q %zmm28, %zmm31, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm18, %zmm21 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,5,13,0] ; AVX512DQ-NEXT: movb $96, %sil +; AVX512DQ-NEXT: kmovw %esi, %k1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [4,12,0,0,0,0,0,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm24, %zmm11, %zmm18 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,7,15,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512DQ-NEXT: movb $24, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm13, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm29, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm16 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %ymm23 -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm17 -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 -; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm20, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm28, %zmm7 -; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm20, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm31, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,2,3,4,15,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm15, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,12,0,0,0,0,7] +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = 
[0,0,0,0,0,3,11,0] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm31, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,10,0,0,0,0,0,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm17, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,9,0,0,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,7,15,0] +; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,0,0,0,0,0,14,6] +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm27, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,15,0] +; AVX512DQ-NEXT: vpermi2q %zmm24, %zmm26, %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [13,0,2,3,4,5,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm24, %zmm23, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm13[0],ymm23[2],ymm13[2] ; AVX512DQ-NEXT: movb $28, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm27, %zmm31 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm21 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm11 {%k2} -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm30, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %xmm27 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [6,0,0,0,0,13,14,7] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm22, %zmm29 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,13,0,0,0,0,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm1, %zmm27 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,3,4,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: 
vpermi2q %zmm28, %zmm26, %zmm24 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [1,0,0,0,0,0,10,2] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [3,0,0,0,0,0,12,4] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm8, %zmm26 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm8, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,8,0,0,0,0,0,1] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 +; AVX512DQ-NEXT: movb $48, %sil +; AVX512DQ-NEXT: kmovw %esi, %k3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,13,0,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: movb $12, %sil ; AVX512DQ-NEXT: kmovw %esi, %k5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k5} ; AVX512DQ-NEXT: movb $112, %sil ; AVX512DQ-NEXT: kmovw %esi, %k7 -; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} +; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} ; AVX512DQ-NEXT: movb $120, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} -; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-NEXT: movb $6, %sil ; AVX512DQ-NEXT: kmovw %esi, %k4 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} ; AVX512DQ-NEXT: movb $56, %sil ; AVX512DQ-NEXT: kmovw %esi, %k6 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14 {%k6} -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5} -; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7} -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm30 {%k5} +; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm18, %zmm30 {%k7} +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512DQ-NEXT: movb $-31, %sil -; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; 
AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-NEXT: kmovw %esi, %k2 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,2,3],zmm26[4,5,6,7] ; AVX512DQ-NEXT: movb $-61, %sil -; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm15 {%k6} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: kmovw %esi, %k2 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm23[1],ymm13[1],ymm23[3],ymm13[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: movb $14, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQ-NEXT: vzeroupper ; 
AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride7_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: movb $64, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm16 -; AVX512DQ-FCP-NEXT: movb $24, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,15,0,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %xmm27 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm27 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,8,0,0,0,0,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-FCP-NEXT: movb $12, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm8 {%k3} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28 +; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm12 ; AVX512DQ-FCP-NEXT: movb $112, %sil -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: movb $48, %dil -; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3} ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm8 {%k3} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm24 +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,5,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,5,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 ; AVX512DQ-FCP-NEXT: movb $96, %sil -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21 -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3} -; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22 +; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm12, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [4,12,0,0,0,0,0,5] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,12,0,0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm19 ; AVX512DQ-FCP-NEXT: movb $120, %sil -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm26, %zmm4 {%k3} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,7,15,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [6,0,0,0,0,13,14,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm14, %zmm29 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm30 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm26 = [0,7,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 +; AVX512DQ-FCP-NEXT: movb $24, %dil +; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 {%k2} ; AVX512DQ-FCP-NEXT: movb $-31, %dil -; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm18 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30 +; AVX512DQ-FCP-NEXT: kmovw %edi, %k3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [5,0,0,0,0,0,14,6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm15, %zmm29 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,13,0,0,0,0,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm29, %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FCP-NEXT: movb $-61, %dil -; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm20, %zmm28 {%k3} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %ymm20 +; AVX512DQ-FCP-NEXT: kmovw %edi, %k3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [3,0,0,0,0,0,12,4] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 +; AVX512DQ-FCP-NEXT: movb $48, %sil +; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %ymm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,3,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512DQ-FCP-NEXT: vpermt2q %ymm24, %ymm6, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: movb $14, %sil -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm17 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6] -; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FCP-NEXT: vpermi2q %ymm20, %ymm11, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm23 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k3} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-FCP-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = 
ymm28[0],ymm24[0],ymm28[2],ymm24[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,3,11,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,10,0,0,0,0,0,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: movb $28, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm18[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,0,0,0,0,0,10,2] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: movb $6, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 -; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,3,4,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: movb $64, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, 
%zmm31, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm31, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [13,0,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -4599,823 +4405,763 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-LABEL: store_i64_stride7_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q 
%zmm2, %zmm16, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm18, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 -; AVX512BW-NEXT: movb $48, %sil -; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm18 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm16 = [0,7,15,0] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: movb $64, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm31 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm22 -; AVX512BW-NEXT: movb $24, %sil -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} 
zmm18 = [14,1,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm26, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm3, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,0,0,0,0,1] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 ; AVX512BW-NEXT: movb $96, %sil +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [4,12,0,0,0,0,0,5] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,7,15,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512BW-NEXT: movb $24, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,12,7,0,1,12,7] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %ymm31 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm30, %zmm20, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm30, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm30, %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm23, %zmm20 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = 
ymm23[0],ymm31[0],ymm23[2],ymm31[2] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,2,3,4,15,0,0] +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,12,0,0,0,0,7] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,3,11,0] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,10,0,0,0,0,0,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,9,0,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,7,15,0] +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm14, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,0,0,0,0,0,14,6] +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm15 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,15,0] +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm26, %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [13,0,2,3,4,5,6,14] +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm23, %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %ymm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] ; AVX512BW-NEXT: movb $28, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm28, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm21 -; AVX512BW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa (%r8), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k3} = zmm29[2,3,2,3],zmm28[2,3,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm28[0],mem[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm28 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [6,0,0,0,0,13,14,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm22, %zmm28 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,13,0,0,0,0,6,7] +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa (%r9), %ymm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 +; AVX512BW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] +; 
AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,3,4,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,5,13,0] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm26, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [3,0,0,0,0,0,12,4] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [1,0,0,0,0,0,10,2] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 +; AVX512BW-NEXT: movb $48, %sil +; AVX512BW-NEXT: kmovd %esi, %k3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,5,13,0,0,0] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,6,14] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm26 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: movb $12, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3} -; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm27, %zmm27 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k3} +; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 ; AVX512BW-NEXT: movb $112, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 {%k4} -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm5 {%k3} -; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm26, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k4} +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} +; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512BW-NEXT: movb $120, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k3} -; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: movb $6, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k4} ; AVX512BW-NEXT: movb $56, %sil ; AVX512BW-NEXT: kmovd %esi, %k5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k5} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k5} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512BW-NEXT: movb $-31, %sil -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-NEXT: kmovd %esi, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,2,3],zmm31[4,5,6,7] ; AVX512BW-NEXT: movb $-61, %sil -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k5} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: kmovd %esi, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} +; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm30 {%k4} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm30 {%k5} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm15[1],ymm24[3],ymm15[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: movb $14, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 
704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride7_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm4 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: movb $64, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: movb $24, %sil +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,0,0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %xmm16 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} 
xmm14 = xmm14[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: movb $12, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,15,0,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] -; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 -; AVX512BW-FCP-NEXT: movb $48, %sil +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm22 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512BW-FCP-NEXT: movb $112, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm29 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm30 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: 
vpermi2q %zmm3, %zmm15, %zmm20 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm18[0],zmm2[0],zmm18[2],zmm2[2],zmm18[4],zmm2[4],zmm18[6],zmm2[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %ymm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,3,7,7] -; AVX512BW-FCP-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm14 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm14, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,5,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm27 ; AVX512BW-FCP-NEXT: movb $96, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm18 -; AVX512BW-FCP-NEXT: movb $28, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k3} = zmm5[2,3,2,3],zmm12[2,3,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm29 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm31, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 
(%r8), %ymm27 -; AVX512BW-FCP-NEXT: vpermi2q %ymm23, %ymm27, %ymm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm27[0],ymm23[0],ymm27[2],ymm23[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k3} = zmm23[2,3,2,3],zmm12[2,3,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: movb $12, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm5 -; AVX512BW-FCP-NEXT: movb $112, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k2} +; AVX512BW-FCP-NEXT: kmovd %esi, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,0,0,0,0,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,12,0,0,0,0,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm28, %zmm14 ; AVX512BW-FCP-NEXT: movb $120, %sil +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [6,0,0,0,0,13,14,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm22, %zmm13, %zmm29 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm30 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,7,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: movb $24, %dil +; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: movb $-31, %dil +; AVX512BW-FCP-NEXT: kmovd %edi, %k3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [5,0,0,0,0,0,14,6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm15, %zmm29 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,13,0,0,0,0,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm29, %zmm30 +; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-FCP-NEXT: movb $-61, %dil +; AVX512BW-FCP-NEXT: kmovd %edi, %k3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} ; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm14 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [3,0,0,0,0,0,12,4] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm21 +; AVX512BW-FCP-NEXT: movb $48, %sil +; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm26, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k3} +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm25 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %ymm26 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,3,7,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512BW-FCP-NEXT: vpermt2q %ymm26, %ymm6, %ymm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: movb $14, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} -; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: movb $6, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5} -; AVX512BW-FCP-NEXT: movb $56, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k6} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: movb $-31, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-FCP-NEXT: movb $-61, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm27 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512BW-FCP-NEXT: vpermi2q %ymm25, %ymm8, %ymm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k3} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm23[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,3,11,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [2,10,0,0,0,0,0,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: movb $28, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm19[2,3,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,0,0,0,0,0,10,2] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm25[0],ymm8[2],ymm25[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm13[2,3,2,3] +; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: movb $6, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,3,4,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: movb $64, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: movb $56, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k3} -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} -; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k6} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [13,0,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 
704(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: movb $48, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,7,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $64, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 -; AVX512DQ-BW-NEXT: movb $24, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] -; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,5,13,0] ; AVX512DQ-BW-NEXT: movb $96, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm11, %zmm18 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512DQ-BW-NEXT: movb $24, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; 
AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %ymm23 -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm17 -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm20, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm28, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm15, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm31, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,10,0,0,0,0,0,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm26, %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm23, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm13[0],ymm23[2],ymm13[2] ; AVX512DQ-BW-NEXT: movb $28, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm27, %zmm31 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm27 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [6,0,0,0,0,13,14,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm22, %zmm29 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm26, %zmm24 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm26 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 +; AVX512DQ-BW-NEXT: movb $48, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; 
AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: movb $12, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k5} ; AVX512DQ-BW-NEXT: movb $112, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k7 -; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: movb $120, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $6, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} ; AVX512DQ-BW-NEXT: movb $56, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k6} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5} -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm30 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm18, %zmm30 {%k7} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: movb $-31, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-NEXT: kmovd %esi, %k2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,2,3],zmm26[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $-61, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k6} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: kmovd %esi, %k2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm5 +; AVX512DQ-BW-NEXT: 
vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm23[1],ymm13[1],ymm23[3],ymm13[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: movb $14, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride7_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm5 -; 
AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: movb $64, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm16 -; AVX512DQ-BW-FCP-NEXT: movb $24, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-BW-FCP-NEXT: movb $12, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm8 {%k3} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28 +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 +; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movb $112, %sil -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm16 -; AVX512DQ-BW-FCP-NEXT: movb $48, %dil -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3} ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm8 {%k3} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,5,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 ; AVX512DQ-BW-FCP-NEXT: movb $96, %sil -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22 +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm28, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movb $120, %sil -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm10, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm26, %zmm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [6,0,0,0,0,13,14,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm14, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm26 = [0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 +; AVX512DQ-BW-FCP-NEXT: movb $24, %dil +; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $-31, %dil -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30 +; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm15, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm29, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: movb $-61, %dil -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm20 +; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} +; AVX512DQ-BW-FCP-NEXT: 
kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 +; AVX512DQ-BW-FCP-NEXT: movb $48, %sil +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm24, %ymm6, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: movb $14, %sil -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6] -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm20, %ymm11, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,0,0,0,0,3,11,0] 
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,10,0,0,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $28, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm18[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: movb $6, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: movb $64, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm31, 
%zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm31, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -7994,3489 +7740,3335 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2184, %rsp # imm = 0x888 +; AVX512-NEXT: subq $1928, %rsp # imm = 0x788 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm23 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm29 -; AVX512-NEXT: vmovdqa64 
(%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm16 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm27 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512-NEXT: movb $96, %r10b ; AVX512-NEXT: kmovw %r10d, %k1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa (%r9), %ymm10 -; AVX512-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,3,4,9,0,0] +; AVX512-NEXT: vpermt2q %zmm3, %zmm15, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,3,11,0] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,10,0,0,0,0,0,3] +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa (%r9), %ymm8 +; AVX512-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%r9), %ymm12 +; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%r8), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%r8), %ymm13 ; AVX512-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX512-NEXT: movb $28, %r10b ; AVX512-NEXT: kmovw %r10d, 
%k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,0,0,0,0,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [5,0,0,0,0,0,14,6] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm20, %zmm21 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm17 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,0,0,0,0,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-NEXT: vpmovsxbq {{.*#+}} 
zmm3 = [0,0,0,0,0,7,15,0] +; AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [6,0,0,0,0,13,14,7] +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512-NEXT: vpermt2q %zmm7, %zmm22, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-NEXT: vpermt2q %zmm18, %zmm14, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-NEXT: vpermt2q %zmm18, %zmm8, %zmm13 +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm13 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm19, %zmm0 -; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm18, %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm0 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512-NEXT: vpermt2q %zmm18, %zmm17, %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm21, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512-NEXT: vmovdqa 128(%r9), %ymm13 -; AVX512-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm30[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512-NEXT: vmovdqa64 128(%r9), %ymm17 +; AVX512-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 128(%r8), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm17[0],ymm5[2],ymm17[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm5[2,3,2,3],zmm29[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%r8), %zmm5 -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512-NEXT: vpermi2q %zmm19, %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm5, %zmm19, %zmm20 +; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm20 ; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm5 -; AVX512-NEXT: vpermt2q %zmm5, %zmm3, %zmm30 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vpermt2q %zmm29, %zmm12, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vpermt2q %zmm29, %zmm19, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2q %zmm28, %zmm19, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm10 
-; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm26 +; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,13,0] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512-NEXT: vpermt2q %zmm2, %zmm20, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm2, %zmm26, %zmm22 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,7,15,0,0,0] +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512-NEXT: vpermt2q %zmm27, %zmm20, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512-NEXT: vpermt2q %zmm27, %zmm25, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm22 +; AVX512-NEXT: vpermi2q %zmm28, %zmm1, %zmm20 +; AVX512-NEXT: vpermi2q %zmm28, %zmm1, %zmm25 +; AVX512-NEXT: vpermt2q %zmm1, %zmm0, %zmm28 ; AVX512-NEXT: movb $48, %r10b ; AVX512-NEXT: kmovw %r10d, %k3 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm7[0],zmm2[0],zmm7[2],zmm2[2],zmm7[4],zmm2[4],zmm7[6],zmm2[6] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512-NEXT: vpermt2q %zmm16, %zmm13, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm21 -; AVX512-NEXT: 
vmovdqa64 %zmm14, %zmm23 -; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512-NEXT: vpermt2q %zmm22, %zmm13, %zmm20 -; AVX512-NEXT: vpermi2q %zmm2, %zmm7, %zmm13 -; AVX512-NEXT: vpermi2q %zmm7, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm25 -; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm16 -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm17 -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512-NEXT: vpermi2q %zmm22, %zmm18, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,0,0,0,0,0,10,2] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512-NEXT: vpermt2q %zmm9, %zmm4, %zmm16 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm9[0],zmm23[0],zmm9[2],zmm23[2],zmm9[4],zmm23[4],zmm9[6],zmm23[6] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,0,0,0,1] +; AVX512-NEXT: vpermt2q %zmm23, %zmm5, %zmm17 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512-NEXT: vpermt2q %zmm23, %zmm1, %zmm21 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,6,14] +; AVX512-NEXT: vpermt2q %zmm23, %zmm2, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,7,15,0] +; AVX512-NEXT: vpermt2q %zmm23, %zmm8, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512-NEXT: vpermt2q %zmm10, %zmm4, %zmm31 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm10[0],zmm11[0],zmm10[2],zmm11[2],zmm10[4],zmm11[4],zmm10[6],zmm11[6] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512-NEXT: vpermi2q %zmm22, %zmm18, %zmm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512-NEXT: vpermt2q %zmm22, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm23 +; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm9 +; AVX512-NEXT: vpermt2q %zmm11, %zmm8, %zmm7 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 192(%rsi), 
%zmm0 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [12,0,0,3,4,5,6,13] -; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 -; AVX512-NEXT: movb $24, %sil -; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 -; AVX512-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512-NEXT: vpermt2q %zmm15, %zmm4, %zmm13 +; AVX512-NEXT: vpermi2q %zmm18, %zmm15, %zmm1 +; AVX512-NEXT: vpermi2q %zmm18, %zmm15, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm15[0],zmm18[0],zmm15[2],zmm18[2],zmm15[4],zmm18[4],zmm15[6],zmm18[6] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512-NEXT: vpermt2q %zmm18, %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q %zmm18, %zmm8, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm0, %zmm11, %zmm5 +; AVX512-NEXT: vpermi2q %zmm11, %zmm0, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,3,11,0] +; AVX512-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm11 +; AVX512-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: movb $12, %sil +; AVX512-NEXT: kmovw %esi, %k3 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm8, %zmm8 +; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 {%k4} ; AVX512-NEXT: vmovdqa 64(%rdx), %xmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k3} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm8, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} ; AVX512-NEXT: vmovdqa 128(%rdx), %xmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm8, 
%ymm0, %ymm8 -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4} +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k3} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm8 +; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm8, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} ; AVX512-NEXT: vmovdqa 192(%rdx), %xmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4} -; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm8 -; AVX512-NEXT: movb $112, %sil -; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm27 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} -; AVX512-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k3} +; AVX512-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512-NEXT: vmovdqa64 192(%r9), %zmm19 +; AVX512-NEXT: vpermi2q %zmm19, %zmm8, %zmm0 +; AVX512-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512-NEXT: movb $120, %sil -; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm24 {%k4} -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload -; AVX512-NEXT: # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k4} +; AVX512-NEXT: kmovw %esi, %k3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 {%k3} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm21 # 64-byte Folded Reload +; AVX512-NEXT: # zmm21 = zmm12[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: movb $-61, %sil ; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k4} -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} -; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: movb $24, %sil +; AVX512-NEXT: kmovw %esi, %k3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm25[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512-NEXT: movb $-31, %sil +; AVX512-NEXT: kmovw %esi, %k4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k3} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k4} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm6 {%k4} +; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: movb $6, %sil ; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} -; AVX512-NEXT: movb $-31, %sil -; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm18 {%k3} -; AVX512-NEXT: movb $56, %sil -; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} -; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4} -; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 {%k3} -; AVX512-NEXT: vpbroadcastq 136(%rcx), %ymm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} -; AVX512-NEXT: vpbroadcastq 200(%rcx), %ymm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} -; AVX512-NEXT: movb $64, %cl +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm16 {%k4} +; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k4} +; AVX512-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm13 {%k4} +; AVX512-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k4 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqa64 192(%rax), %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,11,0,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa 192(%r8), %ymm7 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm6[2,3,2,3] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 {%k4} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k4} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 {%k4} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,9,0,0,0] +; AVX512-NEXT: vpermi2q %zmm19, %zmm8, %zmm1 +; AVX512-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,3,4,9,0,0] +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k4} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,11,0,0,4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm8, %zmm1, %zmm18 +; AVX512-NEXT: 
vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm19, %zmm18, %zmm1 +; AVX512-NEXT: vmovdqa64 192(%r8), %ymm18 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm18[0],mem[0],ymm18[2],mem[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm18[2,3,2,3],zmm2[2,3,2,3] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm11 {%k3} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm18 # 32-byte Folded Reload +; AVX512-NEXT: # ymm18 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm18[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: movb $14, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm22 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512-NEXT: movb $64, %cl +; AVX512-NEXT: kmovw %ecx, %k2 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 {%k2} ; AVX512-NEXT: movb $8, %cl ; AVX512-NEXT: kmovw %ecx, %k2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] -; AVX512-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; 
AVX512-NEXT: vpermi2q %zmm8, %zmm10, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,15,0,0] +; AVX512-NEXT: vpermi2q %zmm8, %zmm11, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm19, %zmm12, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [13,0,2,3,4,5,6,14] +; AVX512-NEXT: vpermi2q %zmm19, %zmm3, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] +; AVX512-NEXT: vpermi2q %zmm19, %zmm10, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,12,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm8, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm11, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm11 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm10, 1472(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 1408(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 960(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 1344(%rax) -; AVX512-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 896(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 1728(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 1536(%rax) -; AVX512-NEXT: addq $2184, %rsp # imm = 0x888 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; +; AVX512-NEXT: vmovdqa64 %zmm20, 1472(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 960(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 832(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 1344(%rax) +; AVX512-NEXT: vmovdqa64 
%zmm28, 1088(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 896(%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 1600(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 1536(%rax) +; AVX512-NEXT: addq $1928, %rsp # imm = 0x788 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; ; AVX512-FCP-LABEL: store_i64_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512-FCP-NEXT: subq $1896, %rsp # imm = 0x768 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5 ; AVX512-FCP-NEXT: movb $96, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,1,9,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,3,4,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,3,11,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,10,0,0,0,0,0,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpermt2q 
%zmm6, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 128(%r8), %ymm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %ymm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm2[0],ymm18[2],ymm2[2] ; AVX512-FCP-NEXT: movb $28, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm5[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] -; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 -; AVX512-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm5, %ymm18 +; AVX512-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [4,12,0,0,0,0,0,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,12,0,0,0,0,7] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX512-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm12 -; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm11 -; 
AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [5,0,0,0,0,0,14,6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,0,0,0,0,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [6,0,0,0,0,13,14,7] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] +; AVX512-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm22 +; AVX512-FCP-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm1[0],ymm21[2],ymm1[2] +; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm21 +; AVX512-FCP-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm24 -; AVX512-FCP-NEXT: movb $48, %r10b -; AVX512-FCP-NEXT: kmovw %r10d, %k3 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpermt2q 
%zmm16, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm24[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm7, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm12 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm19 
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [3,0,0,0,0,0,12,4] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,5,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm15, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,11,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [12,0,0,3,4,5,6,13] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: movb $24, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,15,0,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm0, %zmm29 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm26, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm26, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm22, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm22 +; AVX512-FCP-NEXT: movb $48, %r10b +; AVX512-FCP-NEXT: kmovw %r10d, %k3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,0,0,0,0,1] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [1,0,0,0,0,0,10,2] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm11[0],zmm16[0],zmm11[2],zmm16[2],zmm11[4],zmm16[4],zmm11[6],zmm16[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,7,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm10 +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm8[0],zmm25[2],zmm8[2],zmm25[4],zmm8[4],zmm25[6],zmm8[6] +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm14[0],zmm0[0],zmm14[2],zmm0[2],zmm14[4],zmm0[4],zmm14[6],zmm0[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: movb $12, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} -; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} -; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} -; AVX512-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm9, %zmm5 +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k3} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm8, %zmm8 ; AVX512-FCP-NEXT: movb $112, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm5, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm5, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k4} +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm8, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 {%k4} +; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k3} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm8, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k4} +; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm6 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: movb $14, %sil +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k3} +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512-FCP-NEXT: movb $120, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = zmm15[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4} -; AVX512-FCP-NEXT: movb $120, 
%sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k4} -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} ; AVX512-FCP-NEXT: movb $-61, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 +; AVX512-FCP-NEXT: kmovw %esi, %k5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k4} +; AVX512-FCP-NEXT: movb $24, %sil +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = zmm17[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} -; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm21[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k5} +; AVX512-FCP-NEXT: movb $-31, %sil +; AVX512-FCP-NEXT: kmovw %esi, %k4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 {%k4} +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 {%k4} +; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: movb $6, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} -; AVX512-FCP-NEXT: movb $-31, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} -; AVX512-FCP-NEXT: movb $56, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k4} +; AVX512-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm7 {%k4} +; AVX512-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k4} +; AVX512-FCP-NEXT: movb $56, %cl +; AVX512-FCP-NEXT: kmovw %ecx, %k4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,1,9,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm31, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,3,4,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k4} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} -; AVX512-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} -; AVX512-FCP-NEXT: movb $64, 
%cl -; AVX512-FCP-NEXT: kmovw %ecx, %k4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,11,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,11,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vmovdqa 192(%r8), %ymm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k2} = zmm8[2,3,2,3],zmm2[2,3,2,3] +; AVX512-FCP-NEXT: movb $64, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} ; AVX512-FCP-NEXT: movb $8, %al -; AVX512-FCP-NEXT: kmovw %eax, %k4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%r8), %ymm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,3,4,5,6,14] -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm14 {%k3} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] +; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,4,15,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,12,0,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,12,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermi2q %zmm2, 
%zmm3, %zmm8 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 1408(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1408(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 1280(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1088(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 832(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 640(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 1344(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 896(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 1600(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512-FCP-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1344(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 1728(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1664(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1600(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1536(%rax) +; AVX512-FCP-NEXT: addq $1896, %rsp # imm = 0x768 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $2184, %rsp # imm = 0x888 +; AVX512DQ-NEXT: subq $1928, %rsp # imm = 0x788 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 
64(%rdx), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm7 ; AVX512DQ-NEXT: movb $96, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,1,9,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,3,4,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,11,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,10,0,0,0,0,0,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm18, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm8 ; AVX512DQ-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm14 -; AVX512DQ-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm11 ; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm12 +; 
AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX512DQ-NEXT: movb $28, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k2 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm19 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm9[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,0,0,0,0,7] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [5,0,0,0,0,0,14,6] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,0,0,0,0,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,7,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,0,0,0,0,13,14,7] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm9 +; AVX512DQ-NEXT: 
vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm16, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm18, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm13, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm8, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm8, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm25, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm15 -; AVX512DQ-NEXT: vmovdqa 128(%r9), %ymm14 -; AVX512DQ-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 128(%r8), %ymm5 -; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm18, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512DQ-NEXT: vmovdqa 128(%r9), %ymm11 +; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 128(%r8), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm4[2,3,2,3],zmm29[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm7, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm3, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm1, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm22 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,13,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,7,15,0,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm9, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm23 +; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm14, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm14, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 ; AVX512DQ-NEXT: movb $48, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k3 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = 
zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm14, %zmm28 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm13, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm13, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm13, %zmm18 -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm8, %zmm13 -; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm17, %zmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,0,0,0,0,0,10,2] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm4, %zmm14 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm15[0],zmm17[0],zmm15[2],zmm17[2],zmm15[4],zmm17[4],zmm15[6],zmm17[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm11, %zmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm15, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-NEXT: vpmovsxbq 
{{.*#+}} zmm22 = [12,0,0,3,4,5,6,13] -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 -; AVX512DQ-NEXT: movb $24, %sil -; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,15,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,0,0,0,0,1] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,6,14] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm7, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm4, %zmm17 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm10[0],zmm24[0],zmm10[2],zmm24[2],zmm10[4],zmm24[4],zmm10[6],zmm24[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm1, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm7, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm4, %zmm24 +; AVX512DQ-NEXT: vpermi2q %zmm31, %zmm27, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm31, %zmm27, %zmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm3, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm7, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,3,11,0] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm15, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm12 {%k1} ; AVX512DQ-NEXT: movb $120, %sil -; AVX512DQ-NEXT: kmovw %esi, %k4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11 {%k4} -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm1 {%k1} +; AVX512DQ-NEXT: 
kmovw %esi, %k3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm25 {%k3} +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm5[0,1,2,3],zmm26[4,5,6,7] ; AVX512DQ-NEXT: movb $-61, %sil ; AVX512DQ-NEXT: kmovw %esi, %k4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm4 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k4} +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm26 {%k3} +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 {%k3} +; AVX512DQ-NEXT: movb $24, %sil +; AVX512DQ-NEXT: kmovw %esi, %k3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512DQ-NEXT: movb $-31, %sil +; AVX512DQ-NEXT: kmovw %esi, %k4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 {%k3} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 {%k4} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm20 {%k4} +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: movb $12, %sil ; AVX512DQ-NEXT: kmovw %esi, %k4 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} -; AVX512DQ-NEXT: movb $-31, %sil -; AVX512DQ-NEXT: kmovw %esi, %k3 -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} -; AVX512DQ-NEXT: movb $112, %sil -; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm27 {%k3} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm13 {%k4} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k4} ; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k4} ; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: movb $6, %sil +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512DQ-NEXT: movb $112, %sil ; AVX512DQ-NEXT: kmovw %esi, %k4 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4} +; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm13 {%k4} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3} +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm10 {%k4} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3} -; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm19, %zmm0 -; AVX512DQ-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3} -; AVX512DQ-NEXT: movb $56, %sil -; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k4} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} -; AVX512DQ-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k3} -; 
AVX512DQ-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} -; AVX512DQ-NEXT: movb $64, %cl +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm27 {%k4} +; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm2, %zmm1 +; AVX512DQ-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm3 {%k4} +; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: movb $6, %sil +; AVX512DQ-NEXT: kmovw %esi, %k4 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} +; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k4} +; AVX512DQ-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} +; AVX512DQ-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQ-NEXT: movb $56, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k4 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm24 {%k4} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%r8), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,3,4,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm16 {%k1} +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm6 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm6 
= zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,11,0,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,11,0,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa 192(%r8), %ymm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm15 {%k3} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: movb $14, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm21 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: movb $64, %cl +; AVX512DQ-NEXT: kmovw %ecx, %k2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2} ; AVX512DQ-NEXT: movb $8, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,12,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: 
vpmovsxbq {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm30 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [12,0,0,3,4,5,6,13] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm11, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,15,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm15, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm6, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm11, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm6, %zmm7 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1344(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1088(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1280(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 1152(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 832(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 896(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 640(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 
(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1728(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1536(%rax) -; AVX512DQ-NEXT: addq $2184, %rsp # imm = 0x888 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1536(%rax) +; AVX512DQ-NEXT: addq $1928, %rsp # imm = 0x788 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $2088, %rsp # imm = 0x828 +; AVX512DQ-FCP-NEXT: subq $1864, %rsp # imm = 0x748 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm13 ; AVX512DQ-FCP-NEXT: movb $96, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,3,4,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,3,11,0] +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,10,0,0,0,0,0,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm1[0],ymm17[2],ymm1[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%r9), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %ymm21 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm16[0],ymm3[0],ymm16[2],ymm3[2] ; AVX512DQ-FCP-NEXT: movb $28, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm5, %ymm16 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,3,7,7] +; AVX512DQ-FCP-NEXT: vpermt2q %ymm3, %ymm1, %ymm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq 
{{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm7 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,0,0,0,0,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,0,0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [5,0,0,0,0,0,14,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,0,0,0,0,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,0,7,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [6,0,0,0,0,13,14,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm1, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm21[0],ymm0[0],ymm21[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm1, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm2[2,3,2,3],zmm25[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 
128(%rax), %zmm21 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm5[2,3,2,3],zmm21[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [3,0,0,0,0,0,12,4] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,5,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm0, %zmm30 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm27, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm27, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm23, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm23 ; AVX512DQ-FCP-NEXT: movb $48, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,0,0,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,0,0,0,0,0,10,2] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm9[0],zmm19[0],zmm9[2],zmm19[2],zmm9[4],zmm19[4],zmm9[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,7,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm12[0],zmm24[0],zmm12[2],zmm24[2],zmm12[4],zmm24[4],zmm12[6],zmm24[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm13, %zmm10, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,3,11,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: movb $14, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: movb $120, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512DQ-FCP-NEXT: movb $-61, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm18, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm7, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] -; 
AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm19, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm23, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,11,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,11,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] 
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: movb $24, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,15,0,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: movb $14, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4} -; AVX512DQ-FCP-NEXT: movb $120, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k4} -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} -; AVX512DQ-FCP-NEXT: movb $-61, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k5} +; AVX512DQ-FCP-NEXT: movb $-31, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm15 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: movb $12, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k4} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 {%k3} -; AVX512DQ-FCP-NEXT: movb $-31, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k3} -; AVX512DQ-FCP-NEXT: movb $112, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3} +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4} -; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: movb $6, %sil +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512DQ-FCP-NEXT: movb $112, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm5, %zmm22 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, 
%zmm19 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3} -; AVX512DQ-FCP-NEXT: movb $56, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm11 {%k4} +; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: movb $6, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm20 {%k4} ; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k4} ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm9 {%k4} ; AVX512DQ-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} -; AVX512DQ-FCP-NEXT: movb $64, %cl +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k4} +; AVX512DQ-FCP-NEXT: movb $56, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k4} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,3,4,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm8 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,11,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm10, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%r8), %ymm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-FCP-NEXT: movb $64, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: movb $8, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,12,0,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [13,0,2,3,4,5,6,14] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [12,0,0,3,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm18, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,15,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,12,0,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm6, %zmm10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1472(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 1408(%rax) 
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1344(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 1280(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 1536(%rax) -; AVX512DQ-FCP-NEXT: addq $2088, %rsp # imm = 0x828 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 1600(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQ-FCP-NEXT: addq $1864, %rsp # imm = 0x748 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2184, %rsp # imm = 0x888 +; AVX512BW-NEXT: subq $1928, %rsp # imm = 0x788 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 
64(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512BW-NEXT: movb $96, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512BW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,3,4,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,3,11,0] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,10,0,0,0,0,0,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512BW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm12 +; AVX512BW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 64(%r8), %ymm13 ; AVX512BW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 
64(%r8), %ymm18 -; AVX512BW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX512BW-NEXT: movb $28, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,0,0,0,0,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [5,0,0,0,0,0,14,6] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, 
%zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm17 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,0,0,0,0,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,7,15,0] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [6,0,0,0,0,13,14,7] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm22, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm14, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa 128(%r9), %ymm13 -; AVX512BW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = 
ymm1[0],ymm13[0],ymm1[2],ymm13[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm30[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %ymm17 +; AVX512BW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%r8), %ymm5 +; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm17[0],ymm5[2],ymm17[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm5[2,3,2,3],zmm29[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm19, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm19, %zmm4 -; 
AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm26 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,13,0] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm26, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,7,15,0,0,0] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm1, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm28 ; 
AVX512BW-NEXT: movb $48, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k3 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm7[0],zmm2[0],zmm7[2],zmm2[2],zmm7[4],zmm2[4],zmm7[6],zmm2[6] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm13, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm22, %zmm18, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,0,0,0,0,0,10,2] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm9[0],zmm23[0],zmm9[2],zmm23[2],zmm9[4],zmm23[4],zmm9[6],zmm23[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,0,0,0,1] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm17 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm21 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,6,14] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,7,15,0] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm31 +; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm10[0],zmm11[0],zmm10[2],zmm11[2],zmm10[4],zmm11[4],zmm10[6],zmm11[6] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm22, %zmm18, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [12,0,0,3,4,5,6,13] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 -; AVX512BW-NEXT: movb $24, %sil -; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm15, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm15[0],zmm18[0],zmm15[2],zmm18[2],zmm15[4],zmm18[4],zmm15[6],zmm18[6] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm11, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm0, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,3,11,0] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-NEXT: movb $12, %sil +; AVX512BW-NEXT: kmovd %esi, %k3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm8, %zmm8 +; AVX512BW-NEXT: movb $112, %sil ; 
AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k4} ; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm8, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} ; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm8 +; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm8, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} ; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4} -; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm8 -; AVX512BW-NEXT: movb $112, %sil -; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm8, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: movb $120, %sil -; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k4} -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7] -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k4} +; AVX512BW-NEXT: kmovd %esi, %k3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm21 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm21 = zmm12[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: movb $-61, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k4} -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} -; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: movb $24, %sil +; AVX512BW-NEXT: kmovd %esi, %k3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm25[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512BW-NEXT: movb $-31, %sil +; AVX512BW-NEXT: kmovd %esi, %k4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k4} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k3} +; AVX512BW-NEXT: vmovdqa64 
%zmm29, %zmm6 {%k4} +; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: movb $6, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} -; AVX512BW-NEXT: movb $-31, %sil -; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k3} -; AVX512BW-NEXT: movb $56, %sil -; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} -; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 {%k3} -; AVX512BW-NEXT: vpbroadcastq 136(%rcx), %ymm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} -; AVX512BW-NEXT: vpbroadcastq 200(%rcx), %ymm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} -; AVX512BW-NEXT: movb $64, %cl +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm16 {%k4} +; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k4} +; AVX512BW-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm13 {%k4} +; AVX512BW-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,11,0,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%r8), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm6[2,3,2,3] -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq (%rsp), %ymm7, %ymm7 # 
32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k4} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k4} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k4} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,9,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,3,4,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k4} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,11,0,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm1, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm18[0],mem[0],ymm18[2],mem[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm18[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm18 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm18 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm18[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: movb $14, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm12 = 
ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-NEXT: movb $64, %cl +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k2} ; AVX512BW-NEXT: movb $8, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm10, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,15,0,0] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm11, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm12, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [13,0,2,3,4,5,6,14] +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm3, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm10, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,12,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm11, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm11 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 
1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1344(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 1536(%rax) -; AVX512BW-NEXT: addq $2184, %rsp # imm = 0x888 +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1536(%rax) +; AVX512BW-NEXT: addq $1928, %rsp # imm = 0x788 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride7_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512BW-FCP-NEXT: subq $1896, %rsp # imm = 0x768 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm5 ; AVX512BW-FCP-NEXT: movb $96, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] -; 
AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,1,9,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,3,4,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,3,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,10,0,0,0,0,0,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512BW-FCP-NEXT: vmovdqa 128(%r8), %ymm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %ymm18 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm22 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm2[0],ymm18[2],ymm2[2] ; AVX512BW-FCP-NEXT: movb $28, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] -; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %ymm2, %ymm5, %ymm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [4,12,0,0,0,0,0,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,12,0,0,0,0,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX512BW-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm12 -; AVX512BW-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512BW-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [5,0,0,0,0,0,14,6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q 
%zmm0, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,0,0,0,0,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [6,0,0,0,0,13,14,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] +; AVX512BW-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm1[0],ymm21[2],ymm1[2] +; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm24 -; AVX512BW-FCP-NEXT: movb $48, %r10b -; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm24[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm7, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm12 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [3,0,0,0,0,0,12,4] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,0,0,5,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,11,0,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [12,0,0,3,4,5,6,13] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: movb $24, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,15,0,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm0, %zmm29 +; 
AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm26, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm26, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm22, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm22 +; AVX512BW-FCP-NEXT: movb $48, %r10b +; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,0,0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [1,0,0,0,0,0,10,2] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm11[0],zmm16[0],zmm11[2],zmm16[2],zmm11[4],zmm16[4],zmm11[6],zmm16[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,7,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm8[0],zmm25[2],zmm8[2],zmm25[4],zmm8[4],zmm25[6],zmm8[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm14[0],zmm0[0],zmm14[2],zmm0[2],zmm14[4],zmm0[4],zmm14[6],zmm0[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q 
%zmm0, %zmm9, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: movb $12, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} -; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm8, %zmm8 ; AVX512BW-FCP-NEXT: movb $112, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm8, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 
{%k4} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm8, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm6 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: movb $14, %sil +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: movb $120, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm4 = zmm15[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4} -; AVX512BW-FCP-NEXT: movb $120, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k4} -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 
{%k3} ; AVX512BW-FCP-NEXT: movb $-61, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 +; AVX512BW-FCP-NEXT: kmovd %esi, %k5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k4} +; AVX512BW-FCP-NEXT: movb $24, %sil +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm0 = zmm17[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} -; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: movb $6, %sil +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k5} +; AVX512BW-FCP-NEXT: movb $-31, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} -; AVX512BW-FCP-NEXT: movb $-31, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} -; AVX512BW-FCP-NEXT: movb $56, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512BW-FCP-NEXT: vpbroadcastq 
72(%rcx), %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 {%k4} +; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: movb $6, %sil +; AVX512BW-FCP-NEXT: kmovd %esi, %k4 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k4} +; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm7 {%k4} +; AVX512BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k4} +; AVX512BW-FCP-NEXT: movb $56, %cl +; AVX512BW-FCP-NEXT: kmovd %ecx, %k4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,1,9,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,3,4,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k4} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} -; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} -; AVX512BW-FCP-NEXT: movb $64, %cl -; AVX512BW-FCP-NEXT: kmovd %ecx, %k4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,11,0,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,11,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa 192(%r8), %ymm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k2} = zmm8[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-FCP-NEXT: movb $64, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: movb $8, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%r8), %ymm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,3,4,5,6,14] -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm14 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,4,15,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,12,0,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,12,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 1408(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1280(%rax) ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm1, 1216(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1088(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 832(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 640(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 512(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 1344(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 896(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 1600(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512BW-FCP-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1344(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 1728(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1664(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1600(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1536(%rax) +; AVX512BW-FCP-NEXT: addq $1896, %rsp # imm = 0x768 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride7_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $2184, %rsp # imm = 0x888 +; AVX512DQ-BW-NEXT: subq $1928, %rsp # imm = 0x788 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), 
%zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm7 ; AVX512DQ-BW-NEXT: movb $96, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,10,0,0,0,0,0,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm8 ; AVX512DQ-BW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm14 -; AVX512DQ-BW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm11 ; AVX512DQ-BW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512DQ-BW-NEXT: vmovdqu %ymm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX512DQ-BW-NEXT: movb $28, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm9[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, 
%zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,0,0,0,0,13,14,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa 128(%r9), %ymm14 -; AVX512DQ-BW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 128(%r8), %ymm5 -; AVX512DQ-BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa 128(%r9), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 128(%r8), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm4[2,3,2,3],zmm29[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm7, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,5,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm26 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm14, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm14, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 ; AVX512DQ-BW-NEXT: movb $48, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm28 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm18 -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm4 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm15[0],zmm17[0],zmm15[2],zmm17[2],zmm15[4],zmm17[4],zmm15[6],zmm17[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm21, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [12,0,0,3,4,5,6,13] -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 -; AVX512DQ-BW-NEXT: movb $24, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm17 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm10[0],zmm24[0],zmm10[2],zmm24[2],zmm10[4],zmm24[4],zmm10[6],zmm24[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm7, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vpermi2q %zmm31, %zmm27, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm31, %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm15, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: movb $120, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11 {%k4} -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm25 {%k3} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm5[0,1,2,3],zmm26[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $-61, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm4 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k3} +; AVX512DQ-BW-NEXT: movb $24, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512DQ-BW-NEXT: movb $-31, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k4} +; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: movb $12, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} -; AVX512DQ-BW-NEXT: movb $-31, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} -; AVX512DQ-BW-NEXT: movb $112, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm27 {%k3} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm13 {%k4} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k4} ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k4} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: movb $6, %sil +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512DQ-BW-NEXT: movb $112, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm13 {%k4} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), 
%zmm2, %zmm23 {%k3} +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm10 {%k4} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm19, %zmm0 -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3} -; AVX512DQ-BW-NEXT: movb $56, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k4} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} -; AVX512DQ-BW-NEXT: movb $64, %cl +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm27 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm3 {%k4} +; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: movb $6, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} +; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k4} +; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} +; AVX512DQ-BW-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQ-BW-NEXT: movb $56, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm24 {%k4} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%r8), %ymm6 -; 
AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm6 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,11,0,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,11,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa 192(%r8), %ymm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm15 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: movb $14, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: movb $64, %cl +; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2} ; AVX512DQ-BW-NEXT: movb $8, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,12,0,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [12,0,0,3,4,5,6,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm11, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm15, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm11, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 1344(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1088(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1344(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 1280(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 1152(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 832(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 896(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 832(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 704(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 640(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 512(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 1728(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1536(%rax) -; AVX512DQ-BW-NEXT: addq $2184, %rsp # imm = 0x888 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 1536(%rax) +; AVX512DQ-BW-NEXT: addq $1928, %rsp # imm = 0x788 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride7_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $2088, %rsp # imm = 0x828 +; AVX512DQ-BW-FCP-NEXT: subq $1864, %rsp # imm = 0x748 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 (%r9), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm13 ; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,10,0,0,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r8), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm1[0],ymm17[2],ymm1[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r9), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm16[0],ymm3[0],ymm16[2],ymm3[2] ; AVX512DQ-BW-FCP-NEXT: movb $28, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm0, %ymm5, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm3, %ymm1, %ymm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
64(%r9), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [6,0,0,0,0,13,14,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm4, %ymm1, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm21[0],ymm0[0],ymm21[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm0, %ymm1, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm2[2,3,2,3],zmm25[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm4 -; AVX512DQ-BW-FCP-NEXT: movb $48, %r10b -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm18, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm7, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm5[2,3,2,3],zmm21[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm23 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm23, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,11,0,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,11,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: movb $24, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: movb $14, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4} -; AVX512DQ-BW-FCP-NEXT: movb $120, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: movb $-61, %sil +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,5,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm27, %zmm0, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm27, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm27, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm23, %zmm20 
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: movb $48, %r10b +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm9[0],zmm19[0],zmm9[2],zmm19[2],zmm9[4],zmm19[4],zmm9[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm12[0],zmm24[0],zmm12[2],zmm24[2],zmm12[4],zmm24[4],zmm12[6],zmm24[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm13, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm13, %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: movb $14, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 
{%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: movb $120, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512DQ-BW-FCP-NEXT: movb $-61, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k5} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: movb $24, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k5} +; AVX512DQ-BW-FCP-NEXT: movb $-31, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 {%k3} +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm15 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: movb $12, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 {%k3} -; AVX512DQ-BW-FCP-NEXT: movb $-31, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k3} -; AVX512DQ-BW-FCP-NEXT: movb $112, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: movb $6, %sil +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512DQ-BW-FCP-NEXT: movb $112, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm5, %zmm22 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: movb $56, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm11 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: movb $6, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm20 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm9 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: movb $64, %cl +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k4} +; AVX512DQ-BW-FCP-NEXT: movb $56, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm6 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,11,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm10, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%r8), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-BW-FCP-NEXT: movb $64, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $8, %al -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,12,0,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [13,0,2,3,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [12,0,0,3,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm18, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,12,0,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm6, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1472(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 1408(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1344(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 1280(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 896(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 832(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 1536(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $2088, %rsp # imm = 0x828 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 1600(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: addq $1864, %rsp # imm = 0x748 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -16900,752 +16492,741 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $6248, %rsp # imm = 0x1868 +; AVX512-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm17 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm17 ; AVX512-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,3,11,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,10,0,0,0,0,0,3] ; AVX512-NEXT: movb $96, %r10b ; AVX512-NEXT: kmovw %r10d, %k1 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512-NEXT: vmovdqa64 64(%rax), %zmm6 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm6 +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa (%r9), %ymm7 -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,3,4,9,0,0] +; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 64(%r8), %ymm4 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512-NEXT: movb $28, %r10b ; AVX512-NEXT: kmovw %r10d, %k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [4,12,0,0,0,0,0,5] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,12,0,0,0,0,7] +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [5,0,0,0,0,0,14,6] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,0,0,0,0,6,7] +; AVX512-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,7,15,0] +; AVX512-NEXT: vpermt2q %zmm10, %zmm26, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [6,0,0,0,0,13,14,7] +; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512-NEXT: vpermt2q %zmm16, %zmm13, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm9[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm4, %zmm20, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512-NEXT: vpermt2q %zmm16, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm26, %zmm2 +; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: 
vmovdqa64 128(%rdi), %zmm31 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm17 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512-NEXT: vmovdqa 128(%r8), %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm28, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512-NEXT: vmovdqa 192(%r9), %ymm5 -; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqa64 
128(%r8), %zmm2 +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm19, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm26, %zmm2 +; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm16 +; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 192(%rax), %zmm25 +; AVX512-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm2[2,3,2,3],zmm25[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm6 +; AVX512-NEXT: vpermt2q %zmm25, %zmm8, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 +; AVX512-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm17, %zmm1 -; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm12 -; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm4 +; AVX512-NEXT: vpermt2q %zmm25, %zmm19, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm18, %zmm13, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 256(%r8), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512-NEXT: vmovdqa64 256(%r8), %zmm4 ; AVX512-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm11 -; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm5 -; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-NEXT: 
vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm5 ; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 -; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm11 -; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512-NEXT: vmovdqa 320(%r9), %ymm5 -; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 320(%r8), %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%r8), %zmm5 -; AVX512-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm23, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm26, %zmm4 +; AVX512-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2q %zmm14, %zmm25, %zmm5 -; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm24, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512-NEXT: vpermt2q %zmm5, %zmm20, %zmm6 +; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512-NEXT: vpermt2q %zmm28, %zmm13, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512-NEXT: vmovdqa64 320(%r9), %ymm25 +; AVX512-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 320(%r8), %ymm6 +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm25[0],ymm6[2],ymm25[2] +; AVX512-NEXT: 
vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm6[2,3,2,3],zmm2[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 320(%r8), %zmm6 +; AVX512-NEXT: vmovdqa64 320(%r9), %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm6 +; AVX512-NEXT: vpermt2q %zmm20, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm10 -; AVX512-NEXT: movb $24, %r10b -; AVX512-NEXT: kmovw %r10d, %k3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm20, %zmm7, %zmm19 +; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm7 +; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,5,13,0] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-NEXT: vpermt2q %zmm24, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm28, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm26, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm18, %zmm26, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm18, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vpermt2q %zmm28, %zmm26, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vpermt2q %zmm28, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512-NEXT: vpermi2q %zmm29, %zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm29, %zmm6, %zmm26 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm29, %zmm6, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm29 +; AVX512-NEXT: movb $48, %r10b +; AVX512-NEXT: kmovw %r10d, %k3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 
%zmm21, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,0,1] +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm21 +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,5,13,0,0,0] +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,0,6,14] +; AVX512-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,15,0] +; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm21, %zmm2, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = 
zmm31[0],zmm4[0],zmm31[2],zmm4[2],zmm31[4],zmm4[4],zmm31[6],zmm4[6] +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm31 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm7, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm31 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm22, %zmm14, %zmm31 +; AVX512-NEXT: vpermt2q %zmm22, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm8 -; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512-NEXT: vpermi2q %zmm24, %zmm5, %zmm31 -; AVX512-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm7, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm8 -; AVX512-NEXT: movb $48, %r10b -; AVX512-NEXT: kmovw %r10d, %k4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm7, %zmm20 +; AVX512-NEXT: 
vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm12[0],zmm10[0],zmm12[2],zmm10[2],zmm12[4],zmm10[4],zmm12[6],zmm10[6] +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm30 +; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm28 +; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6] -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm27 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512-NEXT: vmovdqu64 %zmm26, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm29 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6] -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm22, %zmm4, %zmm27 -; AVX512-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm10, %zmm7, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm24 +; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm25, %zmm26 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512-NEXT: vpermt2q %zmm19, %zmm5, %zmm24 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,3,11,0] +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512-NEXT: vpermi2q %zmm12, %zmm19, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 +; AVX512-NEXT: vpermi2q %zmm12, %zmm19, %zmm15 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm18 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512-NEXT: vpermi2q %zmm19, %zmm28, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512-NEXT: vpermi2q %zmm19, %zmm28, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512-NEXT: vpermi2q %zmm19, %zmm28, %zmm10 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6] -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm4, %zmm28 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] -; AVX512-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 -; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm19 +; AVX512-NEXT: vpermt2q %zmm12, %zmm7, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512-NEXT: vpermi2q %zmm13, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: # zmm8 = zmm3[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] -; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 -; AVX512-NEXT: movb $6, %sil -; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,0,0,6,7] -; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512-NEXT: movb $64, %sil -; AVX512-NEXT: kmovw %esi, %k5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,11,0,0,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm3, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512-NEXT: movb $4, %sil +; AVX512-NEXT: kmovw %esi, %k3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 {%k3} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,10,0,5,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm25, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm12 +; AVX512-NEXT: vpmovsxbq 
{{.*#+}} zmm4 = [0,12,0,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: movb $24, %sil ; AVX512-NEXT: kmovw %esi, %k5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} -; AVX512-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [13,0,2,3,4,5,6,14] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm18 {%k5} +; AVX512-NEXT: movb $6, %sil +; AVX512-NEXT: kmovw %esi, %k3 +; AVX512-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,9,0,0,6,7] +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 {%k5} +; AVX512-NEXT: movb $64, %sil +; AVX512-NEXT: kmovw %esi, %k4 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512-NEXT: vmovdqa64 384(%r8), %zmm3 +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512-NEXT: vpermi2q %zmm1, %zmm3, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,9,0,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [4,12,0,0,0,0,0,5] +; AVX512-NEXT: vpermi2q %zmm1, %zmm3, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [5,0,0,0,0,0,14,6] +; 
AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 ; AVX512-NEXT: movb $12, %sil -; AVX512-NEXT: kmovw %esi, %k5 -; AVX512-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} -; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm2 +; AVX512-NEXT: kmovw %esi, %k4 +; AVX512-NEXT: vmovdqa 448(%rdx), %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,0,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] +; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm12, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [13,0,2,3,4,5,6,14] +; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm12 +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 +; AVX512-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa 384(%r9), %ymm12 -; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 384(%r8), %ymm8 -; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,3,4,9,0,0] +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] -; 
AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 384(%r8), %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,0,0,0,0,7] +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,0,0,0,0,6,7] +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm18 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm13 ; AVX512-NEXT: movb $8, %sil ; AVX512-NEXT: kmovw %esi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k5} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k5} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k5} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k5} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k5} ; AVX512-NEXT: movb $-31, %sil ; AVX512-NEXT: kmovw %esi, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k4} +; AVX512-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} -; AVX512-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k4} +; AVX512-NEXT: vmovdqa 256(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti64x4 $0, 
%ymm2, %zmm0, %zmm30 {%k4} +; AVX512-NEXT: vmovdqa 320(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k4} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} -; AVX512-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} -; AVX512-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 {%k2} ; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 {%k2} +; AVX512-NEXT: vinserti32x4 $3, 384(%rax), %zmm25, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} +; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} +; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k3} +; AVX512-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} +; AVX512-NEXT: vpbroadcastq 264(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} -; AVX512-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} -; AVX512-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} +; AVX512-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 {%k3} +; AVX512-NEXT: vpbroadcastq 392(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k3} ; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k2 +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm21 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm20 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} @@ -17653,50 +17234,46 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512-NEXT: movb $120, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 
64-byte Folded Reload -; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: movb $-61, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload ; AVX512-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -17705,106 +17282,108 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 64-byte Folded Reload -; AVX512-NEXT: # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: # zmm8 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: movb $14, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 
32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm10 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm15 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm24 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm9, 
%zmm0, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 3008(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512-NEXT: vmovdqa64 %zmm21, 2752(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 2688(%rax) ; AVX512-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 2496(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 2240(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 2560(%rax) +; AVX512-NEXT: vmovdqa64 %zmm28, 2496(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 2432(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm7, 2368(%rax) +; AVX512-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 2240(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 2176(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, 2048(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm5, 1920(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 1792(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm4, 1472(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512-NEXT: 
vmovdqa64 %zmm3, 1216(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 1152(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 1088(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17813,9 +17392,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 640(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17824,16 +17403,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 3520(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 3520(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17844,808 +17423,794 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 3072(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512-NEXT: addq $6248, %rsp # imm = 0x1868 +; AVX512-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 +; AVX512-FCP-NEXT: subq $6184, %rsp # imm = 0x1828 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512-FCP-NEXT: vmovdqu64 
%zmm20, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,3,11,0] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,10,0,0,0,0,0,3] ; AVX512-FCP-NEXT: movb $96, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,0,1,9,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,3,4,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm10 -; 
AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm8 -; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %ymm16 +; AVX512-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512-FCP-NEXT: movb $28, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [4,12,0,0,0,0,0,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,12,0,0,0,0,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] -; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm5, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,0,0,0,0,0,14,6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,0,0,0,0,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [6,0,0,0,0,13,14,7] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm28[0],ymm16[0],ymm28[2],ymm16[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm4[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 
128(%rsi), %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm27, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm13, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %ymm23 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm0[0],ymm23[2],ymm0[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm3[2,3,2,3],zmm24[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 
192(%rdx), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %ymm29 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm29[0],ymm19[0],ymm29[2],ymm19[2] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm14 -; AVX512-FCP-NEXT: vmovdqa 192(%r9), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %ymm24 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpermt2q 
%zmm5, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm29, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512-FCP-NEXT: vmovdqa 256(%r9), %ymm10 -; AVX512-FCP-NEXT: vmovdqa 256(%r8), %ymm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm27 +; AVX512-FCP-NEXT: vmovdqa 256(%r9), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm8[2,3,2,3],zmm27[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512-FCP-NEXT: vmovdqa 320(%r9), %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %ymm31 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm31[0],ymm6[0],ymm31[2],ymm6[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 320(%r9), %ymm10 +; AVX512-FCP-NEXT: vmovdqa 320(%r8), %ymm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermt2q %ymm6, %ymm0, %ymm31 -; AVX512-FCP-NEXT: vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 384(%r9), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 384(%r8), %ymm4 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm28 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm23 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermt2q %ymm19, %ymm0, %ymm29 +; AVX512-FCP-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermt2q %ymm5, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm10 -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm31, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512-FCP-NEXT: movb $24, %r10b -; AVX512-FCP-NEXT: kmovw %r10d, %k3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [3,0,0,0,0,0,12,4] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,13,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,15,0,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm11 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 
%zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm31, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm2, %zmm23 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm31, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm31, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 ; AVX512-FCP-NEXT: movb $48, %r10b -; AVX512-FCP-NEXT: kmovw %r10d, %k4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = 
zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: kmovw %r10d, %k3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,0,0,0,0,1] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm16 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,5,13,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,6,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,7,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 
# 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 
%zmm30, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm2[0],zmm30[0],zmm2[2],zmm30[2],zmm2[4],zmm30[4],zmm2[6],zmm30[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm2[0],zmm26[0],zmm2[2],zmm26[2],zmm2[4],zmm26[4],zmm2[6],zmm26[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm0[0],zmm18[0],zmm0[2],zmm18[2],zmm0[4],zmm18[4],zmm0[6],zmm18[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm12[0],zmm0[2],zmm12[2],zmm0[4],zmm12[4],zmm0[6],zmm12[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,3,11,0] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm12[0],zmm14[0],zmm12[2],zmm14[2],zmm12[4],zmm14[4],zmm12[6],zmm14[6] +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm10[0],zmm5[0],zmm10[2],zmm5[2],zmm10[4],zmm5[4],zmm10[6],zmm5[6] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm10, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,11,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm28, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm15 +; 
AVX512-FCP-NEXT: movb $4, %sil +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k3} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,10,0,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm17 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 -; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm23 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6] -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm5 -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm14, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [12,0,0,3,4,5,6,13] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: movb $24, %sil +; AVX512-FCP-NEXT: kmovw %esi, %k4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k4} ; AVX512-FCP-NEXT: movb $6, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k5 -; AVX512-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k5} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,9,0,0,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k4} ; AVX512-FCP-NEXT: movb $64, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} -; AVX512-FCP-NEXT: movb $4, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,11,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, 
%zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k3} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,15,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [5,0,0,0,0,0,14,6] +; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm0, %zmm28 ; AVX512-FCP-NEXT: movb $12, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %xmm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,9,0,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} -; AVX512-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: kmovw %esi, %k3 +; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %xmm11 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm6 {%k3} +; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,8,0,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,9,0,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm16 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm19 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} +; AVX512-FCP-NEXT: vmovdqa64 448(%rax), %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,3,4,9,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,0,0,0,0,7] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,0,0,0,0,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm10 ; AVX512-FCP-NEXT: movb $8, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k4} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k3} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm4 {%k3} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k4} ; AVX512-FCP-NEXT: movb $-31, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 
$0, %ymm3, %zmm0, %zmm1 {%k4} -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4} -; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4} -; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4} -; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} +; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k3} +; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 {%k3} +; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k3} +; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 {%k3} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512-FCP-NEXT: movb $112, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512-FCP-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512-FCP-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512-FCP-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 {%k2} +; AVX512-FCP-NEXT: vinserti32x4 $3, 384(%rax), %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2} ; AVX512-FCP-NEXT: movb $56, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: movb $14, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -18656,38 +18221,43 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} ; AVX512-FCP-NEXT: movb $120, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 
%zmm9, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} ; AVX512-FCP-NEXT: movb $-61, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] @@ -18696,855 +18266,833 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 
64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm9 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 3008(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 2944(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 2880(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 2752(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 2688(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm7, 2624(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 2560(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 3008(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 2944(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 2880(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm9, 2816(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 2752(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm9, 2624(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 2560(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm7, 2368(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 2304(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 2432(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 2304(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 2112(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 2112(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 1984(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm7, 1920(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 1856(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1792(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1856(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 1792(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 1664(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm7, 1472(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 1408(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm5, 1472(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512-FCP-NEXT: vmovups 
(%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1152(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm4, 1024(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm4, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 3520(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 3520(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 3456(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 3328(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 3264(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 3200(%rax) -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 3264(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 3200(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 3072(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512-FCP-NEXT: addq $6120, %rsp # imm = 0x17E8 +; AVX512-FCP-NEXT: addq $6184, %rsp # imm = 0x1828 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $6280, %rsp # imm = 0x1888 +; AVX512DQ-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,3,11,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,10,0,0,0,0,0,3] ; AVX512DQ-NEXT: movb $96, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k1 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm8, %zmm2 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQ-NEXT: # 
zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,3,4,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm6 -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512DQ-NEXT: movb $28, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k2 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm24, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,0,0,0,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,12,0,0,0,0,7] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm28, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [5,0,0,0,0,0,14,6] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,13,0,0,0,0,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,7,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [6,0,0,0,0,13,14,7] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm26, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm9[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm11, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm24, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm26, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQ-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa 128(%r8), %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm6, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm26, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm18 
+; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm27 +; AVX512DQ-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm2[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm15, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm30, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm20, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm26, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512DQ-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm23[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqa 192(%r9), %ymm8 -; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm15, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 256(%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; 
AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm28, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm25, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm31, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512DQ-NEXT: vmovdqa 320(%r9), %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 320(%r8), %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm30, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm25, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm27, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm20, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm26, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm5 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm13, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 320(%r9), %ymm23 +; AVX512DQ-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 320(%r8), %ymm6 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm23[0],ymm6[2],ymm23[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k2} = zmm6[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm25, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm14, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 -; AVX512DQ-NEXT: movb $24, %r10b -; AVX512DQ-NEXT: kmovw %r10d, %k3 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 -; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm1, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 
-; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,5,13,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm26, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm26, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm26, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm29, %zmm6, %zmm2 +; 
AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm29, %zmm6, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm29, %zmm6, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm29 +; AVX512DQ-NEXT: movb $48, %r10b +; AVX512DQ-NEXT: kmovw %r10d, %k3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm19[0],zmm3[0],zmm19[2],zmm3[2],zmm19[4],zmm3[4],zmm19[6],zmm3[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,0,1] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,5,13,0,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,0,6,14] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm2, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm13, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k3} = zmm31[0],zmm22[0],zmm31[2],zmm22[2],zmm31[4],zmm22[4],zmm31[6],zmm22[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm13, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm7, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm18[0],zmm11[2],zmm18[2],zmm11[4],zmm18[4],zmm11[6],zmm18[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm13, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm7 -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm5, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q 
%zmm5, %zmm2, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm10 -; AVX512DQ-NEXT: movb $48, %r10b -; AVX512DQ-NEXT: kmovw %r10d, %k4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm7, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm1, %zmm30 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm12[0],zmm21[0],zmm12[2],zmm21[2],zmm12[4],zmm21[4],zmm12[6],zmm21[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm13, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm7, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm24 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k4} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm4, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm25, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 
%zmm15, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm24 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k4} = zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm16, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm16, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm16, %zmm9 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm4, %zmm28 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,3,11,0] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm18, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm18, %zmm15 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm7, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm23 {%k1} +; AVX512DQ-NEXT: 
vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm8 = zmm3[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-NEXT: movb $6, %sil -; AVX512DQ-NEXT: kmovw %esi, %k4 -; AVX512DQ-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,0,0,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-NEXT: movb $64, %sil -; AVX512DQ-NEXT: kmovw %esi, %k5 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,11,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512DQ-NEXT: movb $4, %sil +; AVX512DQ-NEXT: kmovw %esi, %k3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23 {%k3} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,10,0,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm23, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,12,0,3,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: movb $24, %sil ; AVX512DQ-NEXT: kmovw %esi, %k5 -; 
AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} -; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 {%k5} +; AVX512DQ-NEXT: movb $6, %sil +; AVX512DQ-NEXT: kmovw %esi, %k3 +; AVX512DQ-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,9,0,0,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 {%k5} +; AVX512DQ-NEXT: movb $64, %sil +; AVX512DQ-NEXT: kmovw %esi, %k4 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,9,0,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [4,12,0,0,0,0,0,5] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [5,0,0,0,0,0,14,6] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 ; AVX512DQ-NEXT: movb $12, %sil -; AVX512DQ-NEXT: kmovw %esi, %k5 -; AVX512DQ-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} -; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 +; AVX512DQ-NEXT: kmovw %esi, %k4 +; AVX512DQ-NEXT: vmovdqa 448(%rdx), %xmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,0,7] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,9,0,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [13,0,2,3,4,5,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; 
AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa 384(%r9), %ymm12 -; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 384(%r8), %ymm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,3,4,9,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa 384(%r9), %ymm8 ; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 384(%r8), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vpmovsxbq {{.*#+}} zmm4 = [0,1,12,0,0,0,0,7] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,0,0,0,0,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm10 ; AVX512DQ-NEXT: movb $8, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 {%k5} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k5} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k5} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 {%k5} ; AVX512DQ-NEXT: movb $-31, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 
{%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} -; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} -; AVX512DQ-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} -; AVX512DQ-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512DQ-NEXT: 
vmovdqa 384(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512DQ-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm27 {%k4} +; AVX512DQ-NEXT: vmovdqa 320(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k4} +; AVX512DQ-NEXT: vmovdqa 384(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm18 {%k4} ; AVX512DQ-NEXT: movb $112, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm1, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQ-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm3 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2} +; AVX512DQ-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} +; AVX512DQ-NEXT: vinserti64x2 $3, 384(%rax), %zmm23, %zmm18 {%k2} +; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2} -; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: 
vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm19 {%k3} ; AVX512DQ-NEXT: movb $56, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm20 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -19555,33 +19103,31 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-NEXT: movb $120, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 
%zmm20 {%k1} ; AVX512DQ-NEXT: movb $-61, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -19592,10 +19138,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -19604,96 +19150,96 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm7 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm7 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm8 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: movb $14, %cl ; AVX512DQ-NEXT: kmovw 
%ecx, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, 
%zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 2688(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 2624(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 2496(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 3008(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 2752(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 2688(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 2624(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 2560(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 2496(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 2432(%rax) +; AVX512DQ-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm7, 2368(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 2240(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 2240(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 2176(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 2048(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm5, 1920(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1856(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19702,20 +19248,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1216(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1088(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 640(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19725,15 +19271,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 128(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 3520(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 3520(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19744,11 +19290,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 3072(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 3072(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-NEXT: addq $6280, %rsp # imm = 0x1888 +; AVX512DQ-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -19758,831 +19303,812 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,3,11,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,10,0,0,0,0,0,3] ; 
AVX512DQ-FCP-NEXT: movb $96, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,1,9,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,3,4,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512DQ-FCP-NEXT: movb $28, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 
{{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [4,12,0,0,0,0,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,12,0,0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [5,0,0,0,0,0,14,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,13,0,0,0,0,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 
128(%rdx), %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,7,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [6,0,0,0,0,13,14,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm28[0],ymm16[0],ymm28[2],ymm16[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %ymm30 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm25[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %ymm20 +; 
AVX512DQ-FCP-NEXT: vmovdqa 192(%r8), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm20[0],ymm13[2],ymm20[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%r9), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%r8), %ymm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rax), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%r9), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm7[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%r9), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%r8), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%r9), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%r8), %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, 
%zmm23, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %ymm24, %ymm0, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %ymm15, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %ymm9, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 384(%r9), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm30 -; AVX512DQ-FCP-NEXT: movb $24, %r10b -; AVX512DQ-FCP-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm28 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %ymm20, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %ymm5, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [3,0,0,0,0,0,12,4] +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,5,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,15,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,0,0,0,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm23 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: movb $48, %r10b -; AVX512DQ-FCP-NEXT: kmovw %r10d, %k4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: kmovw %r10d, %k3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,0,0,0,0,1] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,5,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,7,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k3} = zmm2[0],zmm30[0],zmm2[2],zmm30[2],zmm2[4],zmm30[4],zmm2[6],zmm30[6] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k3} = zmm2[0],zmm26[0],zmm2[2],zmm26[2],zmm2[4],zmm26[4],zmm2[6],zmm26[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm0[0],zmm12[0],zmm0[2],zmm12[2],zmm0[4],zmm12[4],zmm0[6],zmm12[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,3,11,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, 
%zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm15[0],zmm12[2],zmm15[2],zmm12[4],zmm15[4],zmm12[6],zmm15[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm11[0],zmm5[0],zmm11[2],zmm5[2],zmm11[4],zmm5[4],zmm11[6],zmm5[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm2[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,11,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm28, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,11,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: movb $4, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k3} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,10,0,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [12,0,0,3,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,12,0,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k4} 
= zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: movb $24, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k4} ; AVX512DQ-FCP-NEXT: movb $6, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k5 -; AVX512DQ-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512DQ-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k5} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,9,0,0,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} ; AVX512DQ-FCP-NEXT: movb $64, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} -; AVX512DQ-FCP-NEXT: movb $4, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,11,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k3} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,15,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,1,9,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,0,0,0,0,0,14,6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movb $12, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,0,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,9,0,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; 
AVX512DQ-FCP-NEXT: vmovdqa64 448(%rax), %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rax), %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,3,4,9,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,0,0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,0,0,0,0,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: movb $8, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} 
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k4} ; AVX512DQ-FCP-NEXT: movb $-31, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k3} ; AVX512DQ-FCP-NEXT: movb $112, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm1 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm3, %zmm5 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm31 {%k2} +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 384(%rax), %zmm4, %zmm25 {%k2} ; AVX512DQ-FCP-NEXT: movb $56, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: movb $14, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2} +; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: movb $120, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte 
Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: movb $-61, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] @@ -20591,69 +20117,68 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 
$228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm10 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 3008(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 2944(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 2880(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 2688(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 2624(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 2560(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 3008(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 2944(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 2880(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm10, 2816(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 2752(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm10, 2624(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 2560(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 2432(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 2240(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 2432(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm9, 2368(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 2240(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 2112(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 2112(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1984(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1920(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 1856(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1664(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1472(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1664(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1600(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 1024(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -20663,8 +20188,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 576(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -20673,26 +20198,26 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 3520(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 3456(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 3328(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 3072(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 3072(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512DQ-FCP-NEXT: addq $6120, %rsp # imm = 0x17E8 @@ -20701,752 +20226,741 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-LABEL: store_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6248, %rsp # imm = 0x1868 +; AVX512BW-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,3,11,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,10,0,0,0,0,0,3] ; AVX512BW-NEXT: movb $96, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,3,4,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 64(%r8), %ymm4 ; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512BW-NEXT: movb $28, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [4,12,0,0,0,0,0,5] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,12,0,0,0,0,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [5,0,0,0,0,0,14,6] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,0,0,0,0,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,7,15,0] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [6,0,0,0,0,13,14,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm9[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, 
%zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm17 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512BW-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa 128(%r8), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa 192(%r9), %ymm5 -; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm25 +; 
AVX512BW-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512BW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm2[2,3,2,3],zmm25[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm31, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 256(%r8), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm4 ; AVX512BW-NEXT: 
vmovdqa64 256(%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa 320(%r9), %ymm5 -; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 320(%r8), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512BW-NEXT: vmovdqa64 320(%r9), %ymm25 +; AVX512BW-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 320(%r8), %ymm6 +; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm25[0],ymm6[2],ymm25[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm6[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm10 -; AVX512BW-NEXT: movb $24, %r10b -; AVX512BW-NEXT: kmovd %r10d, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,5,13,0] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm29 +; AVX512BW-NEXT: movb $48, %r10b +; AVX512BW-NEXT: kmovd %r10d, %k3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,0,1] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,5,13,0,0,0] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,0,6,14] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,15,0] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = 
zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm31[0],zmm4[0],zmm31[2],zmm4[2],zmm31[4],zmm4[4],zmm31[6],zmm4[6] +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm31 +; AVX512BW-NEXT: vpermt2q 
%zmm22, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm5, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm8 -; AVX512BW-NEXT: movb $48, %r10b -; AVX512BW-NEXT: kmovd %r10d, %k4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm12[0],zmm10[0],zmm12[2],zmm10[2],zmm12[4],zmm10[4],zmm12[6],zmm10[6] +; AVX512BW-NEXT: vmovdqu64 %zmm23, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6] -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6] -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = 
zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm26 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,3,11,0] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm19, %zmm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm28, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm28, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm28, %zmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6] -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm28 -; AVX512BW-NEXT: vpunpcklqdq 
{{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm8 = zmm3[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 -; AVX512BW-NEXT: movb $6, %sil -; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,0,0,6,7] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 -; AVX512BW-NEXT: movb $64, %sil -; AVX512BW-NEXT: kmovd %esi, %k5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,11,0,0,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512BW-NEXT: movb $4, %sil +; AVX512BW-NEXT: kmovd %esi, %k3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k3} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,10,0,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,12,0,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: movb $24, %sil ; AVX512BW-NEXT: kmovd %esi, %k5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [13,0,2,3,4,5,6,14] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k5} +; AVX512BW-NEXT: movb $6, %sil +; AVX512BW-NEXT: kmovd %esi, %k3 +; AVX512BW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,9,0,0,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k5} +; AVX512BW-NEXT: movb $64, %sil +; AVX512BW-NEXT: kmovd %esi, %k4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,9,0,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [4,12,0,0,0,0,0,5] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [5,0,0,0,0,0,14,6] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 ; AVX512BW-NEXT: movb $12, %sil -; AVX512BW-NEXT: kmovd %esi, %k5 -; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} -; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm2 +; AVX512BW-NEXT: kmovd %esi, %k4 +; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,0,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm12, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [13,0,2,3,4,5,6,14] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%r9), %ymm12 -; AVX512BW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 384(%r8), %ymm8 -; AVX512BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,3,4,9,0,0] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, 
%zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 384(%r8), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,0,0,0,0,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,0,0,0,0,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm18 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm13 ; AVX512BW-NEXT: movb $8, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k5} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k5} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k5} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k5} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k5} ; AVX512BW-NEXT: movb $-31, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 
%zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k4} +; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k4} +; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 {%k4} +; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, 
%ymm0, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k4} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} -; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} -; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512BW-NEXT: movb $112, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 
%zmm23 {%k2} +; AVX512BW-NEXT: vinserti32x4 $3, 384(%rax), %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} +; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512BW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} +; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k3} +; AVX512BW-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512BW-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} +; AVX512BW-NEXT: vpbroadcastq 264(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512BW-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} -; AVX512BW-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} -; AVX512BW-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} +; AVX512BW-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 {%k3} +; AVX512BW-NEXT: vpbroadcastq 392(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k3} ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} @@ -21454,50 +20968,46 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: movb $120, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512BW-NEXT: 
# zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512BW-NEXT: movb $-61, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -21506,106 +21016,108 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm7 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm8 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: movb $14, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: 
vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 3008(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 2752(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 2688(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 2496(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 2240(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 2560(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 2496(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 2432(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, 2368(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 2240(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 2176(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 2048(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm5, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1792(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm4, 1472(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; 
AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1088(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21614,9 +21126,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 640(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21625,16 +21137,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 3520(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 3520(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21645,808 +21157,794 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3072(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-NEXT: addq $6248, %rsp # imm = 0x1868 +; AVX512BW-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 +; 
AVX512BW-FCP-NEXT: subq $6184, %rsp # imm = 0x1828 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,3,11,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,10,0,0,0,0,0,3] ; AVX512BW-FCP-NEXT: movb $96, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = 
[0,0,0,1,9,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,3,4,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %ymm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512BW-FCP-NEXT: movb $28, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [4,12,0,0,0,0,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,12,0,0,0,0,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 
%zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [5,0,0,0,0,0,14,6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,0,0,0,0,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [6,0,0,0,0,13,14,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm28[0],ymm16[0],ymm28[2],ymm16[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm27, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm13, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm23 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm0[0],ymm23[2],ymm0[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm3[2,3,2,3],zmm24[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %ymm19 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm29 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm29[0],ymm19[0],ymm29[2],ymm19[2] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa 192(%r9), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm24 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = 
ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm29, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa 256(%r9), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 256(%r8), %ymm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, 
%zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa 256(%r9), %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm8[2,3,2,3],zmm27[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa 320(%r9), %ymm6 -; 
AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %ymm31 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm31[0],ymm6[0],ymm31[2],ymm6[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa 320(%r9), %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa 320(%r8), %ymm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm17, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %ymm24, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %ymm6, %ymm0, %ymm31 -; AVX512BW-FCP-NEXT: vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 384(%r9), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 384(%r8), %ymm4 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %ymm2, %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm28 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm23 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %ymm19, %ymm0, %ymm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %ymm5, %ymm0, %ymm4 ; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: movb $24, %r10b -; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm20, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [3,0,0,0,0,0,12,4] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,5,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,15,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm31, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm2, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm31, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm31, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: movb $48, %r10b -; AVX512BW-FCP-NEXT: kmovd %r10d, %k4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,0,0,0,0,1] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,5,13,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,6,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,7,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm2[0],zmm30[0],zmm2[2],zmm30[2],zmm2[4],zmm30[4],zmm2[6],zmm30[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6] +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm2[0],zmm26[0],zmm2[2],zmm26[2],zmm2[4],zmm26[4],zmm2[6],zmm26[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm0[0],zmm18[0],zmm0[2],zmm18[2],zmm0[4],zmm18[4],zmm0[6],zmm18[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm7, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm22, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm12[0],zmm0[2],zmm12[2],zmm0[4],zmm12[4],zmm0[6],zmm12[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,3,11,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm12[0],zmm14[0],zmm12[2],zmm14[2],zmm12[4],zmm14[4],zmm12[6],zmm14[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm10[0],zmm5[0],zmm10[2],zmm5[2],zmm10[4],zmm5[4],zmm10[6],zmm5[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm10, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,11,0,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm28, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: movb $4, %sil +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,10,0,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, 
%zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm17 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 -; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [12,0,0,3,4,5,6,13] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: movb $24, %sil +; AVX512BW-FCP-NEXT: kmovd %esi, %k4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k4} ; AVX512BW-FCP-NEXT: movb $6, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k5 -; AVX512BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k5} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,9,0,0,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k4} ; AVX512BW-FCP-NEXT: movb $64, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} -; AVX512BW-FCP-NEXT: movb $4, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,11,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,15,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,9,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, 
%zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [5,0,0,0,0,0,14,6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm0, %zmm28 ; AVX512BW-FCP-NEXT: movb $12, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,9,0,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm11 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm6 {%k3} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,8,0,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,9,0,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm16 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm19 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,3,4,9,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,0,0,0,0,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,0,0,0,0,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: movb $8, %sil +; AVX512BW-FCP-NEXT: kmovd %esi, %k2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k4} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: movb $8, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k4} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm5 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k4} ; AVX512BW-FCP-NEXT: movb $-31, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; 
AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: movb $112, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 {%k2} +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 384(%rax), %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2} ; AVX512BW-FCP-NEXT: movb $56, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: movb $14, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -22457,38 +21955,43 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: movb $120, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: movb $-61, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] @@ -22497,855 +22000,833 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm12 = 
zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm9 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm9 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 3008(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 2944(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 2880(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 2752(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 2688(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm7, 2624(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 2560(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 3008(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 2944(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 2880(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm9, 2816(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 2752(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm9, 2624(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 2560(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm7, 2368(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 2304(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 2432(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 2304(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 2112(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 2112(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 1984(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 1920(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 1856(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1792(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1856(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 1792(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 1664(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm7, 1472(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm13, 1600(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 1472(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm4, 1024(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm4, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 704(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 3520(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 3520(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 3456(%rax) ; 
AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 3328(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3264(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3264(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 3200(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 3072(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-FCP-NEXT: addq $6120, %rsp # imm = 0x17E8 +; AVX512BW-FCP-NEXT: addq $6184, %rsp # imm = 0x1828 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride7_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $6280, %rsp # imm = 0x1888 +; AVX512DQ-BW-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,10,0,0,0,0,0,3] ; AVX512DQ-BW-NEXT: movb $96, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-NEXT: 
vmovdqa64 (%r9), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm2 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm5 ; AVX512DQ-BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512DQ-BW-NEXT: movb $28, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-NEXT: # zmm17 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [6,0,0,0,0,13,14,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm9[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q 
%zmm4, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa 128(%r8), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm9 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512DQ-BW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm2[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: 
vpermt2q %zmm2, %zmm26, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm23[2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa 192(%r9), %ymm8 -; AVX512DQ-BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQ-BW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 256(%r8), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa 320(%r9), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 320(%r8), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = 
zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm30, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm20, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %ymm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 320(%r8), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm23[0],ymm6[2],ymm23[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k2} = zmm6[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm27, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 -; AVX512DQ-BW-NEXT: movb $24, %r10b -; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,5,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 
# 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 
= [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm26, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm29 +; AVX512DQ-BW-NEXT: movb $48, %r10b +; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm19[0],zmm3[0],zmm19[2],zmm3[2],zmm19[4],zmm3[4],zmm19[6],zmm3[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,7,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-NEXT: 
vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k3} = zmm31[0],zmm22[0],zmm31[2],zmm22[2],zmm31[4],zmm22[4],zmm31[6],zmm22[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm18[0],zmm11[2],zmm18[2],zmm11[4],zmm18[4],zmm11[6],zmm18[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm5, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm10 -; AVX512DQ-BW-NEXT: movb $48, %r10b -; AVX512DQ-BW-NEXT: kmovd %r10d, %k4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm12[0],zmm21[0],zmm12[2],zmm21[2],zmm12[4],zmm21[4],zmm12[6],zmm21[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm30 {%k4} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm24 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k4} = zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm16, %zmm9 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm28 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm18, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm18, %zmm15 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm8 = zmm3[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = 
[0,11,0,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: movb $6, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,0,0,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: movb $64, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} -; AVX512DQ-BW-NEXT: movb $4, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,11,0,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: movb $12, %sil +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: movb $4, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k3} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,10,0,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm23, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,12,0,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: movb $24, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 -; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 384(%r9), %ymm12 -; AVX512DQ-BW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 384(%r8), %ymm8 -; AVX512DQ-BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k5} +; AVX512DQ-BW-NEXT: movb $6, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,9,0,0,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k5} +; AVX512DQ-BW-NEXT: movb $64, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: movb $12, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %xmm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,0,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,9,0,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 
384(%r9), %ymm8 +; AVX512DQ-BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 384(%r8), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: movb $8, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k5} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k5} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k5} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k5} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k5} ; AVX512DQ-BW-NEXT: movb $-31, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 +; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; 
AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm27 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm18 {%k4} ; AVX512DQ-BW-NEXT: movb $112, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 384(%rax), %zmm23, %zmm18 {%k2} +; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2} -; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm19 {%k3} ; AVX512DQ-BW-NEXT: movb $56, %cl ; 
AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -23356,33 +22837,31 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: movb $120, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: movb $-61, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -23393,10 +22872,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -23405,96 +22884,96 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # 
zmm7 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm7 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm8 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: movb $14, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, 
%ymm8, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 2688(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 2624(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 2496(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 3008(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 2752(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 2688(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 2624(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 2560(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 2496(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 2432(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm7, 2368(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 2240(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 2240(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 2176(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 2048(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm5, 1920(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1856(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23503,20 +22982,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 1216(%rax) 
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 640(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23526,15 +23005,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 3520(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 3520(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23545,11 +23024,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3072(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 3072(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-BW-NEXT: addq $6280, %rsp # imm = 0x1888 +; AVX512DQ-BW-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -23559,831 +23037,812 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,10,0,0,0,0,0,3] ; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512DQ-BW-FCP-NEXT: movb $28, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm22, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [6,0,0,0,0,13,14,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm28[0],ymm16[0],ymm28[2],ymm16[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), 
%zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm25[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%r8), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm20[0],ymm13[2],ymm20[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%r9), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%r9), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm7[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%r9), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%r8), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%r9), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%r8), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm10 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm24, %ymm0, %ymm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm15, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm9, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%r9), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm28 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm20, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm5, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [3,0,0,0,0,0,12,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,5,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = 
[0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,7,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm2, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: movb $48, %r10b +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,0,0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,0,0,10,2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,5,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm22 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm4, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, 
%zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm30 -; AVX512DQ-BW-FCP-NEXT: movb $24, %r10b -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k3} = zmm2[0],zmm30[0],zmm2[2],zmm30[2],zmm2[4],zmm30[4],zmm2[6],zmm30[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k3} = zmm2[0],zmm26[0],zmm2[2],zmm26[2],zmm2[4],zmm26[4],zmm2[6],zmm26[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: movb $48, %r10b -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, 
%zmm6, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm0[0],zmm12[0],zmm0[2],zmm12[2],zmm0[4],zmm12[4],zmm0[6],zmm12[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,0,3,11,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm15[0],zmm12[2],zmm15[2],zmm12[4],zmm15[4],zmm12[6],zmm15[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm11[0],zmm5[0],zmm11[2],zmm5[2],zmm11[4],zmm5[4],zmm11[6],zmm5[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm2[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,11,0,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm28, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,11,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: movb $4, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,10,0,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [12,0,0,3,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,12,0,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,8,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k4} = zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 +; AVX512DQ-BW-FCP-NEXT: movb $24, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k4} ; AVX512DQ-BW-FCP-NEXT: movb $6, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k5 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,9,0,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} ; AVX512DQ-BW-FCP-NEXT: movb $64, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} -; AVX512DQ-BW-FCP-NEXT: movb $4, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,11,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm0 +; 
AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,1,9,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,0,0,0,0,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,0,0,0,0,0,14,6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $12, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,9,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,3,4,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,0,0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,0,0,0,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movb $8, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte 
Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k4} ; AVX512DQ-BW-FCP-NEXT: movb $-31, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k3} ; AVX512DQ-BW-FCP-NEXT: movb $112, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm1 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm3, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm31 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 384(%rax), %zmm4, %zmm25 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $56, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k2} ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: movb $14, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $120, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # 
zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $-61, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] @@ -24392,69 +23851,68 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm10 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm10 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 3008(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 2944(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 2880(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 2688(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 2624(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 2560(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 3008(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 2944(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 2880(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm10, 2816(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 2752(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm10, 2624(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 2560(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 2432(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 2240(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 2432(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm9, 2368(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 2240(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 2112(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 2112(%rax) ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1984(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1920(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 1856(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1664(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1472(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1664(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1600(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm7, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 1024(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -24464,8 +23922,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 576(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -24474,26 +23932,26 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; 
AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 3520(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 3456(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 3328(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 3072(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 3072(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512DQ-BW-FCP-NEXT: addq $6120, %rsp # imm = 0x17E8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index e837f14d367b2..e4091139296ac 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -649,29 +649,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] 
-; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -692,29 +689,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -735,29 +729,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; 
AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -778,29 +769,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -821,29 +809,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512BW-NEXT: 
vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -864,29 +849,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -907,29 +889,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; 
AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -950,29 +929,26 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q 
%zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1649,104 +1625,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: movb $-64, %r8b ; AVX512-NEXT: kmovw %r8d, %k1 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm10 +; AVX512-NEXT: vmovdqa (%rdi), 
%xmm4 +; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512-NEXT: 
vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 
128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512-NEXT: vzeroupper @@ -1758,104 +1729,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movb $-64, %r8b ; AVX512-FCP-NEXT: kmovw %r8d, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm10 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpunpcklqdq 
{{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; 
AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; 
AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper @@ -1867,104 +1833,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: movb $-64, %r8b ; AVX512DQ-NEXT: kmovw %r8d, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm10 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vpermi2q 
%zmm1, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-NEXT: 
vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) ; 
AVX512DQ-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-NEXT: vzeroupper @@ -1976,104 +1937,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-64, %r8b ; AVX512DQ-FCP-NEXT: kmovw %r8d, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq 
{{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, 
%zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -2085,104 +2041,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-64, %r8b ; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm10 +; 
AVX512BW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), 
%ymm10 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; 
AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512BW-NEXT: vzeroupper @@ -2194,104 +2145,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-64, %r8b ; AVX512BW-FCP-NEXT: kmovd %r8d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, 
%ymm10 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, 
%zmm0 +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -2303,104 +2249,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-64, %r8b ; AVX512DQ-BW-NEXT: kmovd %r8d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512DQ-BW-NEXT: 
vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -2412,104 +2353,99 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %r8b ; AVX512DQ-BW-FCP-NEXT: kmovd %r8d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # 
zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm4, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm2[1],zmm3[1],zmm2[3],zmm3[3],zmm2[5],zmm3[5],zmm2[7],zmm3[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 -; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm15 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -3943,1593 +3879,1617 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride8_vf16: ; AVX512: # %bb.0: +; AVX512-NEXT: subq $72, %rsp ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512-NEXT: vmovdqa64 64(%r11), %zmm3 -; 
AVX512-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512-NEXT: movb $-64, %r8b -; AVX512-NEXT: kmovw %r8d, %k1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = 
ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512-NEXT: movb $-64, %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512-NEXT: 
vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512-NEXT: vpermi2q %zmm19, %zmm6, 
%zmm17 +; AVX512-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512-NEXT: vpermi2q %zmm30, %zmm16, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm8, %zmm23, %zmm15 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vpermt2q %zmm24, 
%zmm2, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, (%rax) +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride8_vf16: ; AVX512-FCP: # %bb.0: +; AVX512-FCP-NEXT: subq $72, %rsp ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512-FCP-NEXT: 
movb $-64, %r8b -; AVX512-FCP-NEXT: kmovw %r8d, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512-FCP-NEXT: 
vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512-FCP-NEXT: vmovdqa64 
64(%rdx), %ymm25 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm29, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512-FCP-NEXT: movb $-64, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] 
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm17 +; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512-FCP-NEXT: 
vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-FCP-NEXT: addq $72, %rsp ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride8_vf16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: subq $72, %rsp ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm11 
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 64(%r11), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512DQ-NEXT: movb $-64, %r8b -; AVX512DQ-NEXT: kmovw %r8d, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, 
%zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: 
vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512DQ-NEXT: 
vmovdqa64 (%rcx), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQ-NEXT: movb $-64, %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = 
zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm6, %zmm17 +; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm16, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q 
%zmm8, %zmm23, %zmm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-NEXT: addq $72, %rsp ; AVX512DQ-NEXT: 
vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride8_vf16: ; AVX512DQ-FCP: # %bb.0: +; AVX512DQ-FCP-NEXT: subq $72, %rsp ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512DQ-FCP-NEXT: movb $-64, %r8b -; AVX512DQ-FCP-NEXT: kmovw %r8d, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = 
zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; 
AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: movb $-64, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: 
vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm17 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; 
AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm4, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $72, %rsp ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride8_vf16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $72, %rsp ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512BW-NEXT: movb $-64, %r8b -; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 
$1, 64(%rdx), %ymm5, %ymm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BW-FCP-LABEL: store_i64_stride8_vf16: -; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512BW-FCP-NEXT: movb $-64, %r8b -; AVX512BW-FCP-NEXT: kmovd %r8d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; 
AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; 
AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), 
%ymm14 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} 
zmm17 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, 
%zmm13, %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: addq $72, %rsp +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BW-FCP-LABEL: store_i64_stride8_vf16: +; AVX512BW-FCP: # %bb.0: +; AVX512BW-FCP-NEXT: subq $72, %rsp +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512BW-FCP-NEXT: 
vmovdqa64 (%r8), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: movb $-64, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm17 +; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} 
= zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 832(%rax) +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-FCP-NEXT: addq $72, %rsp ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride8_vf16: ; AVX512DQ-BW: # %bb.0: +; AVX512DQ-BW-NEXT: subq $72, %rsp ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r11), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512DQ-BW-NEXT: movb $-64, %r8b -; AVX512DQ-BW-NEXT: kmovd %r8d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 
-; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = 
zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = 
ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: movb $-64, %al +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm17 +; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm30, 
%zmm16, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm15 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm10 +; 
AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: addq $72, %rsp ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf16: ; AVX512DQ-BW-FCP: # %bb.0: +; AVX512DQ-BW-FCP-NEXT: subq $72, %rsp ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 -; AVX512DQ-BW-FCP-NEXT: movb $-64, %r8b -; AVX512DQ-BW-FCP-NEXT: kmovd %r8d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; 
AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = 
zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: movb $-64, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm18, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm24[1],zmm29[3],zmm24[3],zmm29[5],zmm24[5],zmm29[7],zmm24[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm26, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = 
zmm29[0],zmm24[0],zmm29[2],zmm24[2],zmm29[4],zmm24[4],zmm29[6],zmm24[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm7[1],zmm10[1],zmm7[3],zmm10[3],zmm7[5],zmm10[5],zmm7[7],zmm10[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm23 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm6[1],zmm19[1],zmm6[3],zmm19[3],zmm6[5],zmm19[5],zmm6[7],zmm19[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm16, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq 
{{.*#+}} ymm12 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: addq $72, %rsp ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -8567,451 +8527,455 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride8_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512-NEXT: vmovdqa64 64(%r10), 
%zmm17 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512-NEXT: movb $-64, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm10 
+; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-NEXT: vpermt2q %zmm19, 
%zmm22, %zmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm31 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; 
AVX512-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 -; AVX512-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%r8), %zmm5 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 
= [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, (%rsp), %ymm0, 
%ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX512-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 
$1, 128(%rdx), %ymm5, %ymm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512-NEXT: vmovdqa 192(%rsi), %xmm4 -; AVX512-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 -; AVX512-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm13 = 
ymm14[2,3],ymm13[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 
= ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 1600(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9020,8 +8984,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9034,457 +9000,461 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, (%rax) -; AVX512-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride8_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512-FCP-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512-FCP-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512-FCP-NEXT: movb $-64, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, 
%zmm31 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 -; 
AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 
+; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512-FCP-NEXT: 
vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, 
%zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; 
AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 
1536(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9493,8 +9463,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9507,457 +9479,461 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512-FCP-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512-FCP-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride8_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512DQ-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQ-NEXT: movb $-64, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 
%zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 
%zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm31 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 
%zmm27, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 
128(%r8), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm30 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = 
zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), 
%xmm12 +; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX512DQ-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: 
vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512DQ-NEXT: 
vmovdqa 192(%rsi), %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512DQ-NEXT: 
vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQ-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm20 +; 
AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512DQ-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; 
AVX512DQ-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1600(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512DQ-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9966,8 +9942,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9980,457 +9958,461 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512DQ-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride8_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512DQ-FCP-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), 
%zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQ-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = 
[6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 
{%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = 
zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = 
zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; 
AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = 
ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; 
AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FCP-NEXT: 
vmovdqa (%rdx), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; 
AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10439,8 +10421,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10453,457 +10437,461 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FCP-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512DQ-FCP-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512BW-NEXT: subq $2504, %rsp # imm = 
0x9C8 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 
-; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm24, 
%zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: 
vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm31 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 
%zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; 
AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; 
AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = 
zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512BW-NEXT: 
vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm18 +; 
AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = 
ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512BW-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10912,8 +10900,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10926,457 +10916,461 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512BW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride8_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512BW-FCP-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-FCP-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512BW-FCP-NEXT: movb $-64, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: 
vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm31 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] -; 
AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, 
%zmm23, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; 
AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-FCP-NEXT: 
vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512BW-FCP-NEXT: 
vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm16 +; 
AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11385,8 +11379,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11399,457 +11395,461 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-FCP-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512BW-FCP-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride8_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512DQ-BW-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-BW-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQ-BW-NEXT: movb $-64, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 
%zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = 
zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, 
%zmm21, %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm31 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = 
[2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm30 ; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, 
%zmm2, %zmm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; 
AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %xmm4 -; AVX512DQ-BW-NEXT: vinserti128 $1, 
192(%rcx), %ymm4, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = 
ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm8[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1600(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11858,8 +11858,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11872,457 +11874,461 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-NEXT: addq $2312, %rsp # imm = 0x908 +; 
AVX512DQ-BW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512DQ-BW-FCP-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = 
[5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm24[0],zmm28[0],zmm24[2],zmm28[2],zmm24[4],zmm28[4],zmm24[6],zmm28[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm24[1],zmm28[1],zmm24[3],zmm28[3],zmm24[5],zmm28[5],zmm24[7],zmm28[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} 
ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm26[0],zmm16[2],zmm26[2],zmm16[4],zmm26[4],zmm16[6],zmm26[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm26[1],zmm16[3],zmm26[3],zmm16[5],zmm26[5],zmm16[7],zmm26[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} xmm6 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm1[0],zmm27[2],zmm1[2],zmm27[4],zmm1[4],zmm27[6],zmm1[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm27[1],zmm1[1],zmm27[3],zmm1[3],zmm27[5],zmm1[5],zmm27[7],zmm1[7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm19[1],zmm1[1],zmm19[3],zmm1[3],zmm19[5],zmm1[5],zmm19[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermi2q 
%zmm1, %zmm8, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm10[0],zmm1[0],zmm10[2],zmm1[2],zmm10[4],zmm1[4],zmm10[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm10[1],zmm1[1],zmm10[3],zmm1[3],zmm10[5],zmm1[5],zmm10[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm3, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm14[1],zmm3[1],zmm14[3],zmm3[3],zmm14[5],zmm3[5],zmm14[7],zmm3[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm5[1],zmm0[1],zmm5[3],zmm0[3],zmm5[5],zmm0[5],zmm5[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm12, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm9 +; 
AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: 
vinserti128 $1, 128(%rcx), %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm22, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: 
vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm20[0],ymm18[0],ymm20[2],ymm18[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm18[1],ymm20[3],ymm18[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm15 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1216(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1152(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 1600(%rax) 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 1984(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 1920(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1856(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 1792(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 1600(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 1344(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12331,8 +12337,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12345,7 +12353,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512DQ-BW-FCP-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -18615,7577 +18623,7953 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512-NEXT: subq $5704, %rsp # imm = 0x1648 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 
-; AVX512-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512-NEXT: vmovdqa64 64(%r10), %zmm24 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512-NEXT: movb $-64, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = 
zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; 
AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = 
zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512-NEXT: vpermt2q %zmm22, %zmm30, %zmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512-NEXT: vpermt2q %zmm4, %zmm26, %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} 
ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,12] +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = 
zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512-NEXT: vmovdqa64 256(%rax), %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512-NEXT: vmovdqa64 
320(%r9), %zmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm10, 
%zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 
{{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 -; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; 
AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q 
%zmm1, %zmm16, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; AVX512-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = 
zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 = 
zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} 
ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 +; AVX512-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 +; AVX512-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512-NEXT: vmovdqa 320(%rsi), %ymm4 ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa 384(%rcx), %ymm0 -; AVX512-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512-NEXT: vmovdqa 384(%rsi), %ymm4 ; AVX512-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa 448(%rcx), %ymm0 -; AVX512-NEXT: vmovdqa 448(%rdx), %ymm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; 
AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 
64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512-NEXT: vmovdqa 320(%rsi), %xmm9 +; AVX512-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: 
vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512-NEXT: vinserti32x4 $1, 384(%rdx), %ymm22, %ymm22 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 3776(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 3520(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; 
AVX512-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 3136(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 1984(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-NEXT: addq $5384, %rsp # 
imm = 0x1508 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; -; AVX512-FCP-LABEL: store_i64_stride8_vf64: -; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $5384, %rsp # imm = 0x1508 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 -; AVX512-FCP-NEXT: movb $-64, %r11b -; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512-FCP-NEXT: vpermt2q 
%zmm24, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; 
AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 3584(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; AVX512-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512-NEXT: addq $5704, %rsp # imm = 0x1648 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; +; AVX512-FCP-LABEL: store_i64_stride8_vf64: +; AVX512-FCP: # %bb.0: +; AVX512-FCP-NEXT: subq $5704, %rsp # imm = 0x1648 +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512-FCP-NEXT: movb $-64, %r11b +; AVX512-FCP-NEXT: kmovw %r11d, %k1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; 
AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm30, %zmm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; 
AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512-FCP-NEXT: 
vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = 
zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512-FCP-NEXT: 
vmovdqa64 320(%r8), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 
%zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 
%zmm6, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 +; 
AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = 
zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 -; 
AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512-FCP-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512-FCP-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 +; AVX512-FCP-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512-FCP-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512-FCP-NEXT: vmovdqa 320(%rsi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa 384(%rcx), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512-FCP-NEXT: vmovdqa 384(%rsi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa 448(%rcx), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %ymm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, 
%ymm4, %zmm7, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512-FCP-NEXT: 
vmovdqa 192(%rdi), %xmm7 -; AVX512-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512-FCP-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqa 320(%rsi), %xmm9 +; AVX512-FCP-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm22, %ymm22 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512-FCP-NEXT: 
vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 3776(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3520(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 
# 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 1984(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FCP-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 
704(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 3584(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps 
%zmm0, 1408(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512-FCP-NEXT: addq $5704, %rsp # imm = 0x1648 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQ-NEXT: subq $5704, %rsp # imm = 0x1648 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm24 +; 
AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512DQ-NEXT: movb $-64, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm10 = 
zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm11 = 
zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm30, %zmm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; 
AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm26, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 
%zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-NEXT: vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,12] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = 
zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 256(%rax), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2q 
%zmm4, %zmm21, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = 
zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; 
AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 448(%rax), %zmm1 
+; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm30 +; 
AVX512DQ-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: 
vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; 
AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; 
AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: 
vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512DQ-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; 
AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 +; AVX512DQ-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 +; AVX512DQ-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512DQ-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; 
AVX512DQ-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512DQ-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-NEXT: vmovdqa 320(%rsi), %ymm4 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa 384(%rcx), %ymm0 -; AVX512DQ-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQ-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-NEXT: vmovdqa 384(%rsi), %ymm4 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa 448(%rcx), %ymm0 -; AVX512DQ-NEXT: vmovdqa 448(%rdx), %ymm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; 
AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQ-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; 
AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQ-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-NEXT: 
vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512DQ-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512DQ-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512DQ-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512DQ-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqa 320(%rsi), %xmm9 +; AVX512DQ-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512DQ-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512DQ-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512DQ-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; 
AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512DQ-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512DQ-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512DQ-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512DQ-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512DQ-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512DQ-NEXT: vinserti32x4 $1, 384(%rdx), %ymm22, %ymm22 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512DQ-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQ-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 3776(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; 
AVX512DQ-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3520(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 3136(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 1984(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 
1600(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512DQ-NEXT: 
vmovdqa64 %zmm15, 3584(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) 
+; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-NEXT: addq $5704, %rsp # imm = 0x1648 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQ-FCP-NEXT: subq $5704, %rsp # imm = 0x1648 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512DQ-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, 
%zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = 
zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm30, %zmm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rax), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm29, 
%zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm28, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = 
zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rsi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rsi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: 
vmovdqa 448(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; 
AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] 
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rsi), %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, 
%zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm22, %ymm22 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 3776(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3520(%rax) -; 
AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1984(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FCP-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 3584(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FCP-NEXT: addq $5704, %rsp # imm = 0x1648 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512BW-NEXT: subq $5704, %rsp # imm = 0x1648 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; 
AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm24 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm23 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = 
zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = 
[0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm30, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512BW-NEXT: 
vpermt2q %zmm4, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,12] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} 
ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; 
AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] 
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; 
AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, 
%zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 ; 
AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; 
AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512BW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; 
AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512BW-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} 
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512BW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; 
AVX512BW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 +; AVX512BW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512BW-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512BW-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512BW-NEXT: vmovdqa 320(%rsi), %ymm4 ; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%rcx), %ymm0 -; AVX512BW-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 384(%rcx), 
%ymm1 +; AVX512BW-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512BW-NEXT: vmovdqa 384(%rsi), %ymm4 ; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 448(%rcx), %ymm0 -; AVX512BW-NEXT: vmovdqa 448(%rdx), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512BW-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa 320(%rsi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte 
Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512BW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm22, %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 3776(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 3520(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; 
AVX512BW-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 3136(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 1984(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: addq 
$5384, %rsp # imm = 0x1508 +; AVX512BW-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 3584(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-NEXT: addq $5704, %rsp # imm = 0x1648 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride8_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512BW-FCP-NEXT: subq $5704, %rsp # imm = 0x1648 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 
64(%r10), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512BW-FCP-NEXT: movb $-64, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = 
[7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, 
%zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm30, %zmm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q 
%zmm4, %zmm26, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} ymm14 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: 
vmovdqa64 384(%rsi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; 
AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512BW-FCP-NEXT: 
vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 
64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, 
%zmm10, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q 
%zmm4, %zmm5, %zmm17 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq 
{{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512BW-FCP-NEXT: vmovdqa 320(%rsi), %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: 
vmovdqa 384(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512BW-FCP-NEXT: vmovdqa 384(%rsi), %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 448(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm19, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: 
vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 320(%rsi), %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; 
AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm11, 3776(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3520(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 1984(%rax) 
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-FCP-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 3584(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512BW-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-FCP-NEXT: addq $5704, %rsp # imm = 0x1648 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride8_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQ-BW-NEXT: subq $5704, %rsp # imm = 0x1648 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), 
%zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512DQ-BW-NEXT: movb $-64, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 
$0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q 
%zmm22, %zmm30, %zmm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; 
AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 
%zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rax), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 
%zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; 
AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; 
AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = 
zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vpermi2q 
%zmm0, %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm0 ; 
AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 256(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 384(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa 384(%rsi), %ymm4 -; 
AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 448(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512DQ-BW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm1, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-BW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-BW-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-BW-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 
= ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm6 +; 
AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 
320(%rsi), %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 384(%rdx), 
%ymm22, %ymm22 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 3776(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3520(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 
3136(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1984(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 256(%rax) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-BW-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 3584(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-BW-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-BW-NEXT: addq $5704, %rsp # imm = 0x1648 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride8_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQ-BW-FCP-NEXT: subq $5704, %rsp # imm = 0x1648 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
128(%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm17[1],zmm28[1],zmm17[3],zmm28[3],zmm17[5],zmm28[5],zmm17[7],zmm28[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,13] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm16[0],zmm21[0],zmm16[2],zmm21[2],zmm16[4],zmm21[4],zmm16[6],zmm21[6] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm21[1],zmm16[3],zmm21[3],zmm16[5],zmm21[5],zmm16[7],zmm21[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm26 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm30, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm2[0],zmm20[0],zmm2[2],zmm20[2],zmm2[4],zmm20[4],zmm2[6],zmm20[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm2[1],zmm20[1],zmm2[3],zmm20[3],zmm2[5],zmm20[5],zmm2[7],zmm20[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm18, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm24[0],zmm22[0],zmm24[2],zmm22[2],zmm24[4],zmm22[4],zmm24[6],zmm22[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm24[1],zmm22[1],zmm24[3],zmm22[3],zmm24[5],zmm22[5],zmm24[7],zmm22[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm12[1],zmm23[1],zmm12[3],zmm23[3],zmm12[5],zmm23[5],zmm12[7],zmm23[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm23, %zmm18, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm20[0],zmm31[2],zmm20[2],zmm31[4],zmm20[4],zmm31[6],zmm20[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm31[1],zmm20[1],zmm31[3],zmm20[3],zmm31[5],zmm20[5],zmm31[7],zmm20[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm10[0],zmm0[2],zmm10[2],zmm0[4],zmm10[4],zmm0[6],zmm10[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm10[1],zmm0[3],zmm10[3],zmm0[5],zmm10[5],zmm0[7],zmm10[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 -; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r10), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm23[0],zmm19[0],zmm23[2],zmm19[2],zmm23[4],zmm19[4],zmm23[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm23[1],zmm19[1],zmm23[3],zmm19[3],zmm23[5],zmm19[5],zmm23[7],zmm19[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r10), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm1[1],zmm25[1],zmm1[3],zmm25[3],zmm1[5],zmm25[5],zmm1[7],zmm25[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r10), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm18[0],zmm13[0],zmm18[2],zmm13[2],zmm18[4],zmm13[4],zmm18[6],zmm13[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm18[1],zmm13[1],zmm18[3],zmm13[3],zmm18[5],zmm13[5],zmm18[7],zmm13[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
%zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r10), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,0,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,0,6,14,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm4 
+; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm6[1],zmm0[1],zmm6[3],zmm0[3],zmm6[5],zmm0[5],zmm6[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,0,8] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r10), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,0,0,8,0,0] +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,2,10,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm17, %zmm15, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r10), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm12[1],zmm4[1],zmm12[3],zmm4[3],zmm12[5],zmm4[5],zmm12[7],zmm4[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm9[0],zmm3[0],zmm9[2],zmm3[2],zmm9[4],zmm3[4],zmm9[6],zmm3[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm9[1],zmm3[1],zmm9[3],zmm3[3],zmm9[5],zmm3[5],zmm9[7],zmm3[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = 
ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; 
AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: 
vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rcx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 
320(%rcx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rsi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rcx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rsi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rcx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte 
Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm6, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rsi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 256(%rcx), %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 256(%rdx), %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm21, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm9, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rsi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 320(%rcx), %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm9[0],ymm16[2],ymm9[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm22, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm16[1],ymm9[1],ymm16[3],ymm9[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm23, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 384(%rcx), %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %xmm22 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 384(%rdx), %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm22[0],ymm16[0],ymm22[2],ymm16[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm23, %zmm24, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm22[1],ymm16[1],ymm22[3],ymm16[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm26, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %xmm22 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 448(%rcx), %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 448(%rdx), %ymm24, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm14, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 3776(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 3712(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 3264(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 3200(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 2240(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 2176(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 1728(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 1664(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 4032(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3968(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3904(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3840(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 3648(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 3584(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 
64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3520(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3456(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3392(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3328(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 3136(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 3008(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2880(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2816(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 2624(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 2560(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2496(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2432(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2304(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 2112(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1984(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1856(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1792(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1472(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1408(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1344(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1280(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1088(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 896(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 3264(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 3200(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 2688(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 2240(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 2176(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1664(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 4032(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3968(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3904(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3840(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 3648(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 3584(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3520(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3456(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3392(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 3072(%rax) +; 
AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3008(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2880(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 2624(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 2112(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 2048(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 1024(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: addq $5704, %rsp # imm = 0x1648 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 9825c87ee069c..512fd925749a3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1082,8 +1082,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,2,0,0,4,2,0] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[18,22,26],zero,zero,zero,zero,ymm1[19,23,27],zero,zero,zero,zero ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0] @@ -1094,8 +1093,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = mem & (ymm4 | ymm1) -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,9,11,1,3,9,11] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,9,11,1,3,9,11] ; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 @@ -1164,8 +1162,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,2,0,0,4,2,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[18,22,26],zero,zero,zero,zero,ymm1[19,23,27],zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0] @@ -1176,8 +1173,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero ; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm4 = mem & (ymm4 | ymm1) -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,9,11,1,3,9,11] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,9,11,1,3,9,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 @@ -1246,14 +1242,12 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,2,6,0,4,2,6] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,2,0,0,4,2,0] ; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm4[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] ; AVX512BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm4[1,5,9,13],zero,zero,zero,ymm4[2,6,10,14],zero,zero,zero,ymm4[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,3,0,3,1,5,0] @@ -1324,14 +1318,12 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,2,6,0,4,2,6] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,2,0,0,4,2,0] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm4[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm4[1,5,9,13],zero,zero,zero,ymm4[2,6,10,14],zero,zero,zero,ymm4[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,3,0,3,1,5,0] @@ -2116,8 +2108,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,5,2,6,1,5,2,6] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,0,0,1,5,2,6] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,zero,ymm7[1,5,u,u,u],zero,zero,ymm7[2,6,u,u,u],zero,zero,ymm7[19,23,u,u,u],zero,zero,ymm7[24,28,u,u,u],zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 @@ -2132,8 +2123,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,9,0,0,1,9,2,10] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7 @@ -2148,8 +2138,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] ; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,0,3,2,0,3,0] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] @@ -2253,8 +2242,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,5,2,6,1,5,2,6] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,5,0,0,1,5,2,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,zero,ymm7[1,5,u,u,u],zero,zero,ymm7[2,6,u,u,u],zero,zero,ymm7[19,23,u,u,u],zero,zero,ymm7[24,28,u,u,u],zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 @@ -2269,8 +2257,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,9,0,0,1,9,2,10] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6 
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7 @@ -2285,8 +2272,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] ; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,0,3,2,0,3,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] @@ -2398,8 +2384,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm4 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,0,0,1,9,2,10] ; AVX512BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] @@ -2545,8 +2530,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,0,0,1,9,2,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] @@ -4326,8 +4310,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,2,0,3,2,0,3,0] ; AVX512-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 @@ -4591,8 +4574,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,2,0,3,2,0,3,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 @@ -8984,8 +8966,7 @@ define void 
@store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[2,3,2,3],zmm0[2,3,2,3] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,0,3,2,0,3,0] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] @@ -9711,8 +9692,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,0,3,2,0,3,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index a9da7abaa945c..d7774e9a9b739 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -1033,8 +1033,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] @@ -1043,8 +1042,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [201851904,218694913,235537922,252380931] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 @@ -1116,8 +1114,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq 
{{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,9,11,1,3,9,11] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] @@ -1126,8 +1123,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [201851904,218694913,235537922,252380931] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 @@ -2099,8 +2095,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,2,0,2,0,2,0,2] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,0,2] ; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 @@ -2120,8 +2115,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] ; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 @@ -2195,8 +2189,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,2,0,2,0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,0,2] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 @@ -2216,8 +2209,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll index b5b9af543ed5c..cc921e7f6ca44 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -5795,8 +5795,7 @@ define <4 x i32> @ugt_1_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_1_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v4i32: @@ -6041,8 +6040,7 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_2_v4i32: @@ -6069,8 +6067,7 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -6415,8 +6412,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_3_v4i32: @@ -6443,8 +6439,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -6789,8 +6784,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_4_v4i32: @@ -6817,8 +6811,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -7163,8 +7156,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_5_v4i32: @@ -7191,8 +7183,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -7537,8 +7528,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_6_v4i32: @@ -7565,8 +7555,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -7911,8 +7900,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_7_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_7_v4i32: @@ -7939,8 +7927,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -8285,8 +8272,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) { ; 
AVX512VPOPCNTDQVL-LABEL: ugt_8_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_8_v4i32: @@ -8313,8 +8299,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -8659,8 +8644,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_9_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_9_v4i32: @@ -8687,8 +8671,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -9033,8 +9016,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_10_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,10,10,10] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_10_v4i32: @@ -9061,8 +9043,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,10,10,10] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -9407,8 +9388,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_11_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [11,11,11,11] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_11_v4i32: @@ -9435,8 +9415,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [11,11,11,11] 
-; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -9781,8 +9760,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_12_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [12,12,12,12] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_12_v4i32: @@ -9809,8 +9787,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [12,12,12,12] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -10155,8 +10132,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_13_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [13,13,13,13] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_13_v4i32: @@ -10183,8 +10159,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [13,13,13,13] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -10529,8 +10504,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_14_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [14,14,14,14] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_14_v4i32: @@ -10557,8 +10531,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [14,14,14,14] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -10903,8 +10876,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_15_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: 
retq ; ; BITALG_NOVLX-LABEL: ugt_15_v4i32: @@ -10931,8 +10903,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -11277,8 +11248,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_16_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_16_v4i32: @@ -11305,8 +11275,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -11651,8 +11620,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_17_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_17_v4i32: @@ -11679,8 +11647,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -12025,8 +11992,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_18_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18,18,18,18] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_18_v4i32: @@ -12053,8 +12019,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18,18,18,18] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -12399,8 +12364,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) { ; 
AVX512VPOPCNTDQVL-LABEL: ugt_19_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [19,19,19,19] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_19_v4i32: @@ -12427,8 +12391,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [19,19,19,19] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -12773,8 +12736,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_20_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_20_v4i32: @@ -12801,8 +12763,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -13147,8 +13108,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_21_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [21,21,21,21] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_21_v4i32: @@ -13175,8 +13135,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [21,21,21,21] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -13521,8 +13480,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_22_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [22,22,22,22] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_22_v4i32: @@ -13549,8 +13507,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: 
vpbroadcastd {{.*#+}} xmm1 = [22,22,22,22] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -13895,8 +13852,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_23_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [23,23,23,23] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_23_v4i32: @@ -13923,8 +13879,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [23,23,23,23] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -14269,8 +14224,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_24_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [24,24,24,24] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_24_v4i32: @@ -14297,8 +14251,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [24,24,24,24] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -14643,8 +14596,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_25_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [25,25,25,25] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_25_v4i32: @@ -14671,8 +14623,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [25,25,25,25] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -15017,8 +14968,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_26_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [26,26,26,26] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_26_v4i32: @@ -15045,8 +14995,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [26,26,26,26] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -15391,8 +15340,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_27_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [27,27,27,27] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_27_v4i32: @@ -15419,8 +15367,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [27,27,27,27] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -15765,8 +15712,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_28_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28,28,28,28] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_28_v4i32: @@ -15793,8 +15739,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28,28,28,28] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -16139,8 +16084,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_29_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [29,29,29,29] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_29_v4i32: @@ -16167,8 +16111,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [29,29,29,29] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -16513,8 +16456,7 
@@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_30_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [30,30,30,30] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_30_v4i32: @@ -16541,8 +16483,7 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) { ; BITALG-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} xmm1 = [30,30,30,30] -; BITALG-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ugt <4 x i32> %2, @@ -16821,8 +16762,7 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_1_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v2i64: @@ -16925,7 +16865,7 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_2_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17074,8 +17014,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_2_v2i64: @@ -17093,8 +17032,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -17231,7 +17169,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_3_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17251,7 +17189,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -17382,8 +17320,7 @@ 
define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_3_v2i64: @@ -17401,8 +17338,7 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -17539,7 +17475,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_4_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17559,7 +17495,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -17690,8 +17626,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_4_v2i64: @@ -17709,8 +17644,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -17847,7 +17781,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_5_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17867,7 +17801,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -17998,8 +17932,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v2i64: ; 
AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_5_v2i64: @@ -18017,8 +17950,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -18155,7 +18087,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_6_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18175,7 +18107,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -18306,8 +18238,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_6_v2i64: @@ -18325,8 +18256,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -18463,7 +18393,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_7_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18483,7 +18413,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -18614,8 +18544,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_7_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [7,7] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_7_v2i64: @@ -18633,8 +18562,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -18771,7 +18699,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_8_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18791,7 +18719,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -18922,8 +18850,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_8_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_8_v2i64: @@ -18941,8 +18868,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -19079,7 +19005,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_9_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -19099,7 +19025,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -19230,8 +19156,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_9_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; 
AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_9_v2i64: @@ -19249,8 +19174,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -19387,7 +19311,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_10_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -19407,7 +19331,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -19538,8 +19462,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_10_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_10_v2i64: @@ -19557,8 +19480,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -19695,7 +19617,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_11_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -19715,7 +19637,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -19846,8 +19768,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_11_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_11_v2i64: @@ -19865,8 +19786,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -20003,7 +19923,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_12_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -20023,7 +19943,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -20154,8 +20074,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_12_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_12_v2i64: @@ -20173,8 +20092,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -20311,7 +20229,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_13_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -20331,7 +20249,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -20462,8 +20380,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_13_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; 
BITALG_NOVLX-LABEL: ugt_13_v2i64: @@ -20481,8 +20398,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -20619,7 +20535,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_14_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -20639,7 +20555,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -20770,8 +20686,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_14_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_14_v2i64: @@ -20789,8 +20704,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -20927,7 +20841,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_15_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -20947,7 +20861,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -21078,8 +20992,7 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_15_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_15_v2i64: @@ -21097,8 +21010,7 @@ define 
<2 x i64> @ugt_15_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -21235,7 +21147,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_16_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -21255,7 +21167,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -21386,8 +21298,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_16_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_16_v2i64: @@ -21405,8 +21316,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -21543,7 +21453,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_17_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -21563,7 +21473,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -21694,8 +21604,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_17_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_17_v2i64: @@ -21713,8 +21622,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb 
%xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -21851,7 +21759,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_18_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -21871,7 +21779,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -22002,8 +21910,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_18_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_18_v2i64: @@ -22021,8 +21928,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -22159,7 +22065,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_19_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -22179,7 +22085,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -22310,8 +22216,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_19_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_19_v2i64: @@ -22329,8 +22234,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -22467,7 +22371,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_20_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -22487,7 +22391,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -22618,8 +22522,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_20_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_20_v2i64: @@ -22637,8 +22540,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -22775,7 +22677,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_21_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -22795,7 +22697,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -22926,8 +22828,7 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_21_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_21_v2i64: @@ -22945,8 +22846,7 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [21,21] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -23083,7 +22983,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_22_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -23103,7 +23003,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -23234,8 +23134,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_22_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_22_v2i64: @@ -23253,8 +23152,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -23391,7 +23289,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_23_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -23411,7 +23309,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -23542,8 +23440,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_23_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_23_v2i64: @@ -23561,8 +23458,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] -; BITALG-NEXT: vpcmpgtq 
%xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -23699,7 +23595,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_24_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -23719,7 +23615,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -23850,8 +23746,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_24_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_24_v2i64: @@ -23869,8 +23764,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -24007,7 +23901,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_25_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -24027,7 +23921,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -24158,8 +24052,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_25_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_25_v2i64: @@ -24177,8 +24070,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -24315,7 +24207,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_26_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -24335,7 +24227,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -24466,8 +24358,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_26_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_26_v2i64: @@ -24485,8 +24376,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -24623,7 +24513,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_27_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -24643,7 +24533,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -24774,8 +24664,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_27_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_27_v2i64: @@ -24793,8 +24682,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq 
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -24931,7 +24819,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_28_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -24951,7 +24839,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -25082,8 +24970,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_28_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_28_v2i64: @@ -25101,8 +24988,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -25239,7 +25125,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_29_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -25259,7 +25145,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -25390,8 +25276,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_29_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_29_v2i64: @@ -25409,8 +25294,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp 
ugt <2 x i64> %2, @@ -25547,7 +25431,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_30_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -25567,7 +25451,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -25698,8 +25582,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_30_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_30_v2i64: @@ -25717,8 +25600,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -25855,7 +25737,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_31_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -25875,7 +25757,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -26006,8 +25888,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_31_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_31_v2i64: @@ -26025,8 +25906,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -26163,7 +26043,7 @@ define <2 x i64> 
@ult_32_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_32_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -26183,7 +26063,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -26314,8 +26194,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_32_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_32_v2i64: @@ -26333,8 +26212,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -26471,7 +26349,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_33_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -26491,7 +26369,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -26622,8 +26500,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_33_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_33_v2i64: @@ -26641,8 +26518,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -26779,7 +26655,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: 
ult_34_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -26799,7 +26675,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -26930,8 +26806,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_34_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_34_v2i64: @@ -26949,8 +26824,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -27087,7 +26961,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_35_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -27107,7 +26981,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -27238,8 +27112,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_35_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_35_v2i64: @@ -27257,8 +27130,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -27395,7 +27267,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_36_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; 
AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -27415,7 +27287,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -27546,8 +27418,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_36_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_36_v2i64: @@ -27565,8 +27436,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -27703,7 +27573,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_37_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -27723,7 +27593,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -27854,8 +27724,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_37_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_37_v2i64: @@ -27873,8 +27742,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -28011,7 +27879,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_38_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; 
AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -28031,7 +27899,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -28162,8 +28030,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_38_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_38_v2i64: @@ -28181,8 +28048,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -28319,7 +28185,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_39_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -28339,7 +28205,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -28470,8 +28336,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_39_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_39_v2i64: @@ -28489,8 +28354,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -28627,7 +28491,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_40_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; 
AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -28647,7 +28511,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -28778,8 +28642,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_40_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_40_v2i64: @@ -28797,8 +28660,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -28935,7 +28797,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_41_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -28955,7 +28817,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -29086,8 +28948,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_41_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_41_v2i64: @@ -29105,8 +28966,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -29243,7 +29103,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_42_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; 
AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -29263,7 +29123,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -29394,8 +29254,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_42_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_42_v2i64: @@ -29413,8 +29272,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -29551,7 +29409,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_43_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -29571,7 +29429,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -29702,8 +29560,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_43_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_43_v2i64: @@ -29721,8 +29578,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -29859,7 +29715,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_44_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; 
AVX512VPOPCNTDQVL-NEXT: retq ; @@ -29879,7 +29735,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -30010,8 +29866,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_44_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_44_v2i64: @@ -30029,8 +29884,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -30167,7 +30021,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_45_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -30187,7 +30041,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -30318,8 +30172,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_45_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_45_v2i64: @@ -30337,8 +30190,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -30475,7 +30327,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_46_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -30495,7 +30347,7 @@ define <2 x 
i64> @ult_46_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -30626,8 +30478,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_46_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_46_v2i64: @@ -30645,8 +30496,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -30783,7 +30633,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_47_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -30803,7 +30653,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -30934,8 +30784,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_47_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_47_v2i64: @@ -30953,8 +30802,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -31091,7 +30939,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_48_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -31111,7 +30959,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, 
%xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -31242,8 +31090,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_48_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_48_v2i64: @@ -31261,8 +31108,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -31399,7 +31245,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_49_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -31419,7 +31265,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -31550,8 +31396,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_49_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_49_v2i64: @@ -31569,8 +31414,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -31707,7 +31551,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_50_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -31727,7 +31571,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: 
vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -31858,8 +31702,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_50_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_50_v2i64: @@ -31877,8 +31720,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -32015,7 +31857,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_51_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -32035,7 +31877,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -32166,8 +32008,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_51_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_51_v2i64: @@ -32185,8 +32026,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -32323,7 +32163,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_52_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -32343,7 +32183,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} 
xmm1 = [52,52] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -32474,8 +32314,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_52_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_52_v2i64: @@ -32493,8 +32332,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -32631,7 +32469,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_53_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -32651,7 +32489,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -32782,8 +32620,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_53_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_53_v2i64: @@ -32801,8 +32638,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -32939,7 +32775,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_54_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -32959,7 +32795,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] ; 
BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -33090,8 +32926,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_54_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_54_v2i64: @@ -33109,8 +32944,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -33247,7 +33081,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_55_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -33267,7 +33101,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -33398,8 +33232,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_55_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_55_v2i64: @@ -33417,8 +33250,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -33555,7 +33387,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_56_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -33575,7 +33407,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = 
tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -33706,8 +33538,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_56_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_56_v2i64: @@ -33725,8 +33556,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -33863,7 +33693,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_57_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -33883,7 +33713,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -34014,8 +33844,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_57_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_57_v2i64: @@ -34033,8 +33862,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -34171,7 +33999,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_58_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -34191,7 +34019,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -34322,8 
+34150,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_58_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_58_v2i64: @@ -34341,8 +34168,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -34479,7 +34305,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_59_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -34499,7 +34325,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -34630,8 +34456,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_59_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_59_v2i64: @@ -34649,8 +34474,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -34787,7 +34611,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_60_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -34807,7 +34631,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -34938,8 +34762,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) { ; 
AVX512VPOPCNTDQVL-LABEL: ugt_60_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_60_v2i64: @@ -34957,8 +34780,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -35095,7 +34917,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_61_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -35115,7 +34937,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -35246,8 +35068,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_61_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_61_v2i64: @@ -35265,8 +35086,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -35403,7 +35223,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_62_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -35423,7 +35243,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -35554,8 +35374,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_62_v2i64: ; AVX512VPOPCNTDQVL: # 
%bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_62_v2i64: @@ -35573,8 +35392,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] -; BITALG-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ugt <2 x i64> %2, @@ -35711,7 +35529,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_63_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -35731,7 +35549,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] ; BITALG-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index c1d30b6d5a995..fec72fe760de8 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -784,8 +784,7 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: eq_1_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v2i64: @@ -864,8 +863,7 @@ define <2 x i64> @ne_1_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ne_1_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -962,8 +960,7 @@ define <4 x i32> @eq_1_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: eq_1_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v4i32: @@ -1066,8 +1063,7 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ne_1_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; 
AVX512VPOPCNTDQVL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index 487f9a5d326cf..7d8926f03fd5d 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -3337,8 +3337,7 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_1_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v8i32: @@ -3496,8 +3495,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_2_v8i32: @@ -3523,8 +3521,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -3701,8 +3698,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_3_v8i32: @@ -3728,8 +3724,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -3906,8 +3901,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_4_v8i32: @@ -3933,8 +3927,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -4111,8 +4104,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_5_v8i32: @@ -4138,8 +4130,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -4316,8 +4307,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_6_v8i32: @@ -4343,8 +4333,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -4521,8 +4510,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_7_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_7_v8i32: @@ -4548,8 +4536,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -4726,8 +4713,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; 
AVX512VPOPCNTDQVL-LABEL: ugt_8_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_8_v8i32: @@ -4753,8 +4739,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -4931,8 +4916,7 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_9_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_9_v8i32: @@ -4958,8 +4942,7 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -5136,8 +5119,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_10_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_10_v8i32: @@ -5163,8 +5145,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -5341,8 +5322,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_11_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_11_v8i32: @@ -5368,8 +5348,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq 
{{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -5546,8 +5525,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_12_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_12_v8i32: @@ -5573,8 +5551,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -5751,8 +5728,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_13_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_13_v8i32: @@ -5778,8 +5754,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -5956,8 +5931,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_14_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_14_v8i32: @@ -5983,8 +5957,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ 
-6161,8 +6134,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_15_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_15_v8i32: @@ -6188,8 +6160,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -6366,8 +6337,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_16_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_16_v8i32: @@ -6393,8 +6363,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -6571,8 +6540,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_17_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_17_v8i32: @@ -6598,8 +6566,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -6776,8 +6743,7 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_18_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,18,18,18,18,18,18,18] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: 
ugt_18_v8i32: @@ -6803,8 +6769,7 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,18,18,18,18,18,18,18] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -6981,8 +6946,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_19_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [19,19,19,19,19,19,19,19] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_19_v8i32: @@ -7008,8 +6972,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [19,19,19,19,19,19,19,19] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -7186,8 +7149,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_20_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,20,20,20,20,20,20,20] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_20_v8i32: @@ -7213,8 +7175,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,20,20,20,20,20,20,20] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -7391,8 +7352,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_21_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [21,21,21,21,21,21,21,21] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_21_v8i32: @@ -7418,8 +7378,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [21,21,21,21,21,21,21,21] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; 
BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -7596,8 +7555,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_22_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [22,22,22,22,22,22,22,22] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_22_v8i32: @@ -7623,8 +7581,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [22,22,22,22,22,22,22,22] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -7801,8 +7758,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_23_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [23,23,23,23,23,23,23,23] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_23_v8i32: @@ -7828,8 +7784,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [23,23,23,23,23,23,23,23] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -8006,8 +7961,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_24_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [24,24,24,24,24,24,24,24] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_24_v8i32: @@ -8033,8 +7987,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [24,24,24,24,24,24,24,24] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -8211,8 +8164,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_25_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [25,25,25,25,25,25,25,25] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_25_v8i32: @@ -8238,8 +8190,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [25,25,25,25,25,25,25,25] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -8416,8 +8367,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_26_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [26,26,26,26,26,26,26,26] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_26_v8i32: @@ -8443,8 +8393,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [26,26,26,26,26,26,26,26] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -8621,8 +8570,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_27_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [27,27,27,27,27,27,27,27] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_27_v8i32: @@ -8648,8 +8596,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [27,27,27,27,27,27,27,27] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -8826,8 +8773,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_28_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_28_v8i32: @@ -8853,8 +8799,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28] -; BITALG-NEXT: 
vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -9031,8 +8976,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_29_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [29,29,29,29,29,29,29,29] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_29_v8i32: @@ -9058,8 +9002,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [29,29,29,29,29,29,29,29] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -9236,8 +9179,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_30_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [30,30,30,30,30,30,30,30] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_30_v8i32: @@ -9263,8 +9205,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastd {{.*#+}} ymm1 = [30,30,30,30,30,30,30,30] -; BITALG-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ugt <8 x i32> %2, @@ -9412,8 +9353,7 @@ define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_1_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_1_v4i64: @@ -9477,7 +9417,7 @@ define <4 x i64> @ult_2_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_2_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,2,2,2] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -9559,8 +9499,7 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; 
BITALG_NOVLX-LABEL: ugt_2_v4i64: @@ -9578,8 +9517,7 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -9642,7 +9580,7 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_3_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -9661,7 +9599,7 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,3,3] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -9725,8 +9663,7 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_3_v4i64: @@ -9744,8 +9681,7 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -9808,7 +9744,7 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_4_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -9827,7 +9763,7 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,4,4,4] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -9891,8 +9827,7 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_4_v4i64: @@ -9910,8 +9845,7 @@ define <4 x i64> 
@ugt_4_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -9974,7 +9908,7 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_5_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -9993,7 +9927,7 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,5,5,5] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,5,5,5] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -10057,8 +9991,7 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,5,5,5] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_5_v4i64: @@ -10076,8 +10009,7 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,5,5,5] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -10140,7 +10072,7 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_6_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -10159,7 +10091,7 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,6,6,6] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -10223,8 +10155,7 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_6_v4i64: @@ -10242,8 +10173,7 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, 
%ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -10306,7 +10236,7 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_7_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -10325,7 +10255,7 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7,7,7,7] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,7,7,7] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -10389,8 +10319,7 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_7_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7,7,7,7] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_7_v4i64: @@ -10408,8 +10337,7 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7,7,7,7] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -10472,7 +10400,7 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_8_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [8,8,8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -10491,7 +10419,7 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,8,8,8] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [8,8,8,8] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -10555,8 +10483,7 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_8_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,8,8,8] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_8_v4i64: @@ -10574,8 +10501,7 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,8,8,8] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -10638,7 +10564,7 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_9_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,9,9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -10657,7 +10583,7 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,9,9,9] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,9,9,9] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -10721,8 +10647,7 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_9_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,9,9,9] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_9_v4i64: @@ -10740,8 +10665,7 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,9,9,9] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -10804,7 +10728,7 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_10_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [10,10,10,10] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [10,10,10,10] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -10823,7 +10747,7 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [10,10,10,10] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [10,10,10,10] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -10887,8 +10811,7 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_10_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [10,10,10,10] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_10_v4i64: @@ -10906,8 +10829,7 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, 
%ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [10,10,10,10] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -10970,7 +10892,7 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_11_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,11,11,11] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [11,11,11,11] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -10989,7 +10911,7 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,11,11,11] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [11,11,11,11] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -11053,8 +10975,7 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_11_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,11,11,11] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_11_v4i64: @@ -11072,8 +10993,7 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,11,11,11] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -11136,7 +11056,7 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_12_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [12,12,12,12] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [12,12,12,12] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -11155,7 +11075,7 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [12,12,12,12] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [12,12,12,12] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -11219,8 +11139,7 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_12_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [12,12,12,12] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_12_v4i64: @@ -11238,8 +11157,7 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw 
%ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [12,12,12,12] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -11302,7 +11220,7 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_13_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,13,13,13] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [13,13,13,13] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -11321,7 +11239,7 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,13,13,13] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [13,13,13,13] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -11385,8 +11303,7 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_13_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,13,13,13] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_13_v4i64: @@ -11404,8 +11321,7 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,13,13,13] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -11468,7 +11384,7 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_14_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [14,14,14,14] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [14,14,14,14] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -11487,7 +11403,7 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [14,14,14,14] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [14,14,14,14] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -11551,8 +11467,7 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_14_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [14,14,14,14] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_14_v4i64: @@ -11570,8 +11485,7 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: 
vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [14,14,14,14] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -11634,7 +11548,7 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_15_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -11653,7 +11567,7 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [15,15,15,15] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,15,15,15] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -11717,8 +11631,7 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_15_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [15,15,15,15] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_15_v4i64: @@ -11736,8 +11649,7 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [15,15,15,15] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -11800,7 +11712,7 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_16_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,16,16,16] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [16,16,16,16] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -11819,7 +11731,7 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,16,16,16] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [16,16,16,16] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -11883,8 +11795,7 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_16_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,16,16,16] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_16_v4i64: @@ -11902,8 +11813,7 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,16,16,16] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -11966,7 +11876,7 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_17_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17,17,17,17] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [17,17,17,17] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -11985,7 +11895,7 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17,17,17,17] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [17,17,17,17] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -12049,8 +11959,7 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_17_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17,17,17,17] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_17_v4i64: @@ -12068,8 +11977,7 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17,17,17,17] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -12132,7 +12040,7 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_18_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,18,18,18] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [18,18,18,18] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -12151,7 +12059,7 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,18,18,18] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [18,18,18,18] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -12215,8 +12123,7 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_18_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,18,18,18] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_18_v4i64: @@ -12234,8 +12141,7 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,18,18,18] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -12298,7 +12204,7 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_19_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [19,19,19,19] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [19,19,19,19] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -12317,7 +12223,7 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [19,19,19,19] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [19,19,19,19] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -12381,8 +12287,7 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_19_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [19,19,19,19] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_19_v4i64: @@ -12400,8 +12305,7 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [19,19,19,19] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -12464,7 +12368,7 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_20_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [20,20,20,20] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [20,20,20,20] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -12483,7 +12387,7 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [20,20,20,20] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [20,20,20,20] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -12547,8 +12451,7 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_20_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [20,20,20,20] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_20_v4i64: @@ -12566,8 +12469,7 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [20,20,20,20] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -12630,7 +12532,7 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_21_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [21,21,21,21] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [21,21,21,21] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -12649,7 +12551,7 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [21,21,21,21] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [21,21,21,21] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -12713,8 +12615,7 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_21_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [21,21,21,21] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_21_v4i64: @@ -12732,8 +12633,7 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [21,21,21,21] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -12796,7 +12696,7 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_22_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [22,22,22,22] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [22,22,22,22] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -12815,7 +12715,7 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [22,22,22,22] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [22,22,22,22] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -12879,8 +12779,7 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_22_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [22,22,22,22] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_22_v4i64: @@ -12898,8 +12797,7 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [22,22,22,22] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -12962,7 +12860,7 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_23_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [23,23,23,23] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [23,23,23,23] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -12981,7 +12879,7 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [23,23,23,23] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [23,23,23,23] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -13045,8 +12943,7 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_23_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [23,23,23,23] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_23_v4i64: @@ -13064,8 +12961,7 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [23,23,23,23] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -13128,7 +13024,7 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_24_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [24,24,24,24] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [24,24,24,24] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -13147,7 +13043,7 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [24,24,24,24] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [24,24,24,24] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -13211,8 +13107,7 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_24_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [24,24,24,24] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_24_v4i64: @@ -13230,8 +13125,7 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [24,24,24,24] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -13294,7 +13188,7 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_25_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25,25,25,25] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [25,25,25,25] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -13313,7 +13207,7 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25,25,25,25] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [25,25,25,25] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -13377,8 +13271,7 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_25_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25,25,25,25] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_25_v4i64: @@ -13396,8 +13289,7 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25,25,25,25] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -13460,7 +13352,7 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_26_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [26,26,26,26] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [26,26,26,26] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -13479,7 +13371,7 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [26,26,26,26] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [26,26,26,26] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -13543,8 +13435,7 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_26_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [26,26,26,26] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_26_v4i64: @@ -13562,8 +13453,7 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; 
BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [26,26,26,26] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -13626,7 +13516,7 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_27_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [27,27,27,27] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [27,27,27,27] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -13645,7 +13535,7 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [27,27,27,27] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [27,27,27,27] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -13709,8 +13599,7 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_27_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [27,27,27,27] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_27_v4i64: @@ -13728,8 +13617,7 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [27,27,27,27] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -13792,7 +13680,7 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_28_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,28,28,28] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [28,28,28,28] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -13811,7 +13699,7 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,28,28,28] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [28,28,28,28] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -13875,8 +13763,7 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_28_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,28,28,28] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_28_v4i64: @@ -13894,8 +13781,7 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, 
%ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,28,28,28] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -13958,7 +13844,7 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_29_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [29,29,29,29] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [29,29,29,29] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -13977,7 +13863,7 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [29,29,29,29] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [29,29,29,29] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -14041,8 +13927,7 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_29_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [29,29,29,29] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_29_v4i64: @@ -14060,8 +13945,7 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [29,29,29,29] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -14124,7 +14008,7 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_30_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,30,30,30] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [30,30,30,30] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -14143,7 +14027,7 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,30,30,30] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [30,30,30,30] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -14207,8 +14091,7 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_30_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,30,30,30] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_30_v4i64: @@ -14226,8 +14109,7 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb 
%ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,30,30,30] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -14290,7 +14172,7 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_31_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [31,31,31,31] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [31,31,31,31] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -14309,7 +14191,7 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [31,31,31,31] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [31,31,31,31] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -14373,8 +14255,7 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_31_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [31,31,31,31] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_31_v4i64: @@ -14392,8 +14273,7 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [31,31,31,31] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -14456,7 +14336,7 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_32_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32,32,32,32] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [32,32,32,32] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -14475,7 +14355,7 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32,32,32,32] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [32,32,32,32] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -14539,8 +14419,7 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_32_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32,32,32,32] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_32_v4i64: @@ -14558,8 +14437,7 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; BITALG-NEXT: 
vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32,32,32,32] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -14622,7 +14500,7 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_33_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [33,33,33,33] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [33,33,33,33] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -14641,7 +14519,7 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [33,33,33,33] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [33,33,33,33] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -14705,8 +14583,7 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_33_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [33,33,33,33] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_33_v4i64: @@ -14724,8 +14601,7 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [33,33,33,33] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -14788,7 +14664,7 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_34_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [34,34,34,34] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [34,34,34,34] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -14807,7 +14683,7 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [34,34,34,34] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [34,34,34,34] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -14871,8 +14747,7 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_34_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [34,34,34,34] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_34_v4i64: @@ -14890,8 +14765,7 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { ; 
BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [34,34,34,34] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -14954,7 +14828,7 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_35_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [35,35,35,35] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [35,35,35,35] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -14973,7 +14847,7 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [35,35,35,35] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [35,35,35,35] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -15037,8 +14911,7 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_35_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [35,35,35,35] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_35_v4i64: @@ -15056,8 +14929,7 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [35,35,35,35] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -15120,7 +14992,7 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_36_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [36,36,36,36] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [36,36,36,36] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -15139,7 +15011,7 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [36,36,36,36] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [36,36,36,36] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -15203,8 +15075,7 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_36_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [36,36,36,36] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_36_v4i64: @@ -15222,8 +15093,7 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> 
%0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [36,36,36,36] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -15286,7 +15156,7 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_37_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [37,37,37,37] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [37,37,37,37] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -15305,7 +15175,7 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [37,37,37,37] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [37,37,37,37] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -15369,8 +15239,7 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_37_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [37,37,37,37] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_37_v4i64: @@ -15388,8 +15257,7 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [37,37,37,37] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -15452,7 +15320,7 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_38_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [38,38,38,38] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [38,38,38,38] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -15471,7 +15339,7 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [38,38,38,38] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [38,38,38,38] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -15535,8 +15403,7 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_38_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [38,38,38,38] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_38_v4i64: @@ -15554,8 +15421,7 @@ define <4 x i64> @ugt_38_v4i64(<4 x 
i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [38,38,38,38] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -15618,7 +15484,7 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_39_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [39,39,39,39] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [39,39,39,39] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -15637,7 +15503,7 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [39,39,39,39] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [39,39,39,39] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -15701,8 +15567,7 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_39_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [39,39,39,39] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_39_v4i64: @@ -15720,8 +15585,7 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [39,39,39,39] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -15784,7 +15648,7 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_40_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [40,40,40,40] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [40,40,40,40] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -15803,7 +15667,7 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [40,40,40,40] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [40,40,40,40] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -15867,8 +15731,7 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_40_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [40,40,40,40] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_40_v4i64: @@ -15886,8 +15749,7 @@ define <4 x i64> 
@ugt_40_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [40,40,40,40] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -15950,7 +15812,7 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_41_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [41,41,41,41] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [41,41,41,41] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -15969,7 +15831,7 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [41,41,41,41] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [41,41,41,41] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -16033,8 +15895,7 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_41_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [41,41,41,41] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_41_v4i64: @@ -16052,8 +15913,7 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [41,41,41,41] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -16116,7 +15976,7 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_42_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [42,42,42,42] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -16135,7 +15995,7 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [42,42,42,42] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -16199,8 +16059,7 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_42_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_42_v4i64: @@ -16218,8 +16077,7 @@ define <4 x 
i64> @ugt_42_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -16282,7 +16140,7 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_43_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [43,43,43,43] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [43,43,43,43] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -16301,7 +16159,7 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [43,43,43,43] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [43,43,43,43] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -16365,8 +16223,7 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_43_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [43,43,43,43] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_43_v4i64: @@ -16384,8 +16241,7 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [43,43,43,43] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -16448,7 +16304,7 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_44_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [44,44,44,44] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [44,44,44,44] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -16467,7 +16323,7 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [44,44,44,44] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [44,44,44,44] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -16531,8 +16387,7 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_44_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [44,44,44,44] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_44_v4i64: @@ -16550,8 +16405,7 @@ define 
<4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [44,44,44,44] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -16614,7 +16468,7 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_45_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [45,45,45,45] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [45,45,45,45] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -16633,7 +16487,7 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [45,45,45,45] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [45,45,45,45] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -16697,8 +16551,7 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_45_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [45,45,45,45] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_45_v4i64: @@ -16716,8 +16569,7 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [45,45,45,45] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -16780,7 +16632,7 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_46_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [46,46,46,46] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [46,46,46,46] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -16799,7 +16651,7 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [46,46,46,46] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [46,46,46,46] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -16863,8 +16715,7 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_46_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [46,46,46,46] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_46_v4i64: @@ -16882,8 +16733,7 @@ 
define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [46,46,46,46] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -16946,7 +16796,7 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_47_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [47,47,47,47] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [47,47,47,47] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -16965,7 +16815,7 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [47,47,47,47] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [47,47,47,47] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -17029,8 +16879,7 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_47_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [47,47,47,47] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_47_v4i64: @@ -17048,8 +16897,7 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [47,47,47,47] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -17112,7 +16960,7 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_48_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [48,48,48,48] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [48,48,48,48] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17131,7 +16979,7 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [48,48,48,48] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [48,48,48,48] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -17195,8 +17043,7 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_48_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [48,48,48,48] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_48_v4i64: @@ -17214,8 
+17061,7 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [48,48,48,48] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -17278,7 +17124,7 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_49_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [49,49,49,49] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [49,49,49,49] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17297,7 +17143,7 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [49,49,49,49] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [49,49,49,49] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -17361,8 +17207,7 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_49_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [49,49,49,49] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_49_v4i64: @@ -17380,8 +17225,7 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [49,49,49,49] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -17444,7 +17288,7 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_50_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [50,50,50,50] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [50,50,50,50] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17463,7 +17307,7 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [50,50,50,50] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [50,50,50,50] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -17527,8 +17371,7 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_50_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [50,50,50,50] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_50_v4i64: @@ 
-17546,8 +17389,7 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [50,50,50,50] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -17610,7 +17452,7 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_51_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [51,51,51,51] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [51,51,51,51] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17629,7 +17471,7 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [51,51,51,51] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [51,51,51,51] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -17693,8 +17535,7 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_51_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [51,51,51,51] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_51_v4i64: @@ -17712,8 +17553,7 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [51,51,51,51] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -17776,7 +17616,7 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_52_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [52,52,52,52] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [52,52,52,52] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17795,7 +17635,7 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [52,52,52,52] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [52,52,52,52] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -17859,8 +17699,7 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_52_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [52,52,52,52] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: 
ugt_52_v4i64: @@ -17878,8 +17717,7 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [52,52,52,52] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -17942,7 +17780,7 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_53_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [53,53,53,53] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [53,53,53,53] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -17961,7 +17799,7 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [53,53,53,53] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [53,53,53,53] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -18025,8 +17863,7 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_53_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [53,53,53,53] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_53_v4i64: @@ -18044,8 +17881,7 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [53,53,53,53] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -18108,7 +17944,7 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_54_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [54,54,54,54] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [54,54,54,54] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18127,7 +17963,7 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [54,54,54,54] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [54,54,54,54] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -18191,8 +18027,7 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_54_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [54,54,54,54] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; 
BITALG_NOVLX-LABEL: ugt_54_v4i64: @@ -18210,8 +18045,7 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [54,54,54,54] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -18274,7 +18108,7 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_55_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [55,55,55,55] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [55,55,55,55] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18293,7 +18127,7 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [55,55,55,55] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [55,55,55,55] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -18357,8 +18191,7 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_55_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [55,55,55,55] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_55_v4i64: @@ -18376,8 +18209,7 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [55,55,55,55] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -18440,7 +18272,7 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_56_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [56,56,56,56] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [56,56,56,56] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18459,7 +18291,7 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [56,56,56,56] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [56,56,56,56] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -18523,8 +18355,7 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_56_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [56,56,56,56] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: 
retq ; ; BITALG_NOVLX-LABEL: ugt_56_v4i64: @@ -18542,8 +18373,7 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [56,56,56,56] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -18606,7 +18436,7 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_57_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [57,57,57,57] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [57,57,57,57] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18625,7 +18455,7 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [57,57,57,57] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [57,57,57,57] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -18689,8 +18519,7 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_57_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [57,57,57,57] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_57_v4i64: @@ -18708,8 +18537,7 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [57,57,57,57] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -18772,7 +18600,7 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_58_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [58,58,58,58] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [58,58,58,58] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18791,7 +18619,7 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [58,58,58,58] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [58,58,58,58] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -18855,8 +18683,7 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_58_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [58,58,58,58] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; 
AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_58_v4i64: @@ -18874,8 +18701,7 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [58,58,58,58] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -18938,7 +18764,7 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_59_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [59,59,59,59] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [59,59,59,59] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -18957,7 +18783,7 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [59,59,59,59] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [59,59,59,59] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -19021,8 +18847,7 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_59_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [59,59,59,59] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_59_v4i64: @@ -19040,8 +18865,7 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [59,59,59,59] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -19104,7 +18928,7 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_60_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [60,60,60,60] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [60,60,60,60] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -19123,7 +18947,7 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [60,60,60,60] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [60,60,60,60] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -19187,8 +19011,7 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_60_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [60,60,60,60] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, 
%ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_60_v4i64: @@ -19206,8 +19029,7 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [60,60,60,60] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -19270,7 +19092,7 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_61_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [61,61,61,61] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [61,61,61,61] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -19289,7 +19111,7 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [61,61,61,61] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [61,61,61,61] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -19353,8 +19175,7 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_61_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [61,61,61,61] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_61_v4i64: @@ -19372,8 +19193,7 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [61,61,61,61] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -19436,7 +19256,7 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_62_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [62,62,62,62] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [62,62,62,62] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -19455,7 +19275,7 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [62,62,62,62] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [62,62,62,62] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) @@ -19519,8 +19339,7 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ugt_62_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [62,62,62,62] -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_62_v4i64: @@ -19538,8 +19357,7 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [62,62,62,62] -; BITALG-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ugt <4 x i64> %2, @@ -19602,7 +19420,7 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ult_63_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] +; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -19621,7 +19439,7 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] +; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm1 = [63,63,63,63] ; BITALG-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index 7fb60b987d95d..d18aa3d34a3db 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -556,8 +556,7 @@ define <4 x i64> @eq_1_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: eq_1_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v4i64: @@ -642,8 +641,7 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQVL-LABEL: ne_1_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -722,8 +720,7 @@ define <8 x i32> @eq_1_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: eq_1_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: eq_1_v8i32: @@ -802,8 +799,7 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQVL-LABEL: ne_1_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index f434fc8c6cad8..e44a1e153afc4 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -27,30 +27,11 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) nounwind { ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v2i64_v2i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v2i64_v2i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v2i64_v2i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v2i64_v2i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v2i64_v2i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setb %al +; AVX-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b @@ -71,30 +52,11 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) nounwind { ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v4i32_v4i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i32_v4i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i32_v4i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i32_v4i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v4i32_v4i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setb %al +; AVX-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b @@ -116,30 +78,11 @@ define i1 @trunc_v8i16_v8i1(<8 x i16>) nounwind { ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v8i16_v8i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i16_v8i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i16_v8i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i16_v8i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [281479271743489,281479271743489] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v8i16_v8i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setb %al +; AVX-NEXT: retq %a = trunc <8 x i16> %0 to <8 x i1> %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -160,30 +103,11 @@ 
define i1 @trunc_v16i8_v16i1(<16 x i8>) nounwind { ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v16i8_v16i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v16i8_v16i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v16i8_v16i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v16i8_v16i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v16i8_v16i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setb %al +; AVX-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -221,13 +145,28 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b @@ -265,13 +204,28 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v8i32_v8i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v8i32_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -310,13 +264,28 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: 
trunc_v16i16_v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -354,13 +323,28 @@ define i1 @trunc_v32i8_v32i1(<32 x i8>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v32i8_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v32i8_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v32i8_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v32i8_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b diff --git a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll index 29366f74da12a..d660e2b24b4bc 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll @@ -1552,7 +1552,7 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsadbw %zmm5, %zmm4, %zmm4 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,8,12,0,4] ; AVX512VL-NEXT: vpermd %zmm4, %zmm6, %zmm4 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 @@ -1586,7 +1586,7 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; AVX512VPOPCNT-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512VPOPCNT-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VPOPCNT-NEXT: vpsadbw %zmm5, %zmm4, %zmm4 -; AVX512VPOPCNT-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4] +; 
AVX512VPOPCNT-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,8,12,0,4] ; AVX512VPOPCNT-NEXT: vpermd %zmm4, %zmm6, %zmm4 ; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll index f80544fdef7e6..39798be110674 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -27,30 +27,11 @@ define i1 @trunc_v2i64_v2i1(<2 x i64>) nounwind { ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v2i64_v2i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setne %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v2i64_v2i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setne %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v2i64_v2i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v2i64_v2i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setne %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v2i64_v2i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setne %al +; AVX-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> %b = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) ret i1 %b @@ -71,30 +52,11 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) nounwind { ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v4i32_v4i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setne %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i32_v4i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setne %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i32_v4i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i32_v4i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setne %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v4i32_v4i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setne %al +; AVX-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b @@ -115,30 +77,11 @@ define i1 @trunc_v8i16_v8i1(<8 x i16>) nounwind { ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v8i16_v8i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setne %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i16_v8i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setne %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i16_v8i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i16_v8i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [281479271743489,281479271743489] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setne %al -; AVX512VL-NEXT: retq +; AVX-LABEL: 
trunc_v8i16_v8i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setne %al +; AVX-NEXT: retq %a = trunc <8 x i16> %0 to <8 x i1> %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b @@ -159,30 +102,11 @@ define i1 @trunc_v16i8_v16i1(<16 x i8>) nounwind { ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v16i8_v16i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setne %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v16i8_v16i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setne %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v16i8_v16i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v16i8_v16i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setne %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v16i8_v16i1: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: setne %al +; AVX-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b @@ -220,13 +144,28 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b @@ -264,13 +203,28 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v8i32_v8i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v8i32_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper 
+; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b @@ -308,13 +262,28 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v16i16_v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b @@ -352,13 +321,28 @@ define i1 @trunc_v32i8_v32i1(<32 x i8>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v32i8_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v32i8_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v32i8_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v32i8_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b @@ -429,8 +413,7 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind { ; ; AVX512-LABEL: trunc_v8i64_v8i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1] -; AVX512-NEXT: vptestmd %zmm1, %zmm0, %k0 +; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper @@ -2500,5 +2483,3 @@ declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>) declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 9cd0f4d12e15a..0fda2f8dfdb31 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -829,30 +829,11 @@ define i1 @trunc_v2i64(<2 x i64> %a0) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: trunc_v2i64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: sete %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: sete %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v2i64: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] -; AVX512BWVL-NEXT: vptest %xmm1, %xmm0 -; AVX512BWVL-NEXT: sete %al -; AVX512BWVL-NEXT: retq +; AVX-LABEL: trunc_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) %2 = trunc i64 %1 to i16 %3 = icmp eq i16 %2, 0 @@ -890,13 +871,28 @@ define i1 @mask_v8i32(<8 x i32> %a0) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: mask_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: mask_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mask_v8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: mask_v8i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX512BWVL-NEXT: sete %al +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = and i32 %1, 2147483648 %3 = icmp eq i32 %2, 0 @@ -1032,26 +1028,12 @@ define zeroext i1 @PR44781(ptr %0) { ; AVX2-NEXT: sete %al ; AVX2-NEXT: retq ; -; AVX512F-LABEL: PR44781: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15] -; AVX512F-NEXT: vptest (%rdi), %xmm0 -; AVX512F-NEXT: sete %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: PR44781: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15] -; AVX512BW-NEXT: vptest (%rdi), %xmm0 -; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: PR44781: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [64424509455,64424509455] -; AVX512BWVL-NEXT: vptest (%rdi), %xmm0 -; AVX512BWVL-NEXT: sete %al -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: PR44781: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15] +; AVX512-NEXT: vptest (%rdi), %xmm0 +; 
AVX512-NEXT: sete %al +; AVX512-NEXT: retq %2 = load <4 x i32>, ptr %0, align 4 %3 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2) %4 = and i32 %3, 15 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index c2c6a5f7eba57..48ef33f309584 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -682,7 +682,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX512VL-LABEL: splatvar_rotate_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -704,7 +704,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX512VLBW-LABEL: splatvar_rotate_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 8ac0b178a16df..2174b4937891a 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -330,7 +330,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; ; AVX512VL-LABEL: splatvar_rotate_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5 @@ -358,7 +358,7 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; ; AVX512VLBW-LABEL: splatvar_rotate_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] +; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index dbbfaab9ea26a..904c6dfd8ead7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1023,8 +1023,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1129,8 +1128,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,3,0,3] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1201,8 +1199,7 @@ define <16 x i16> 
@shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,0,0,0,5,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1271,8 +1268,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,7,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index dbbd6b19b2829..7039e4ebe902d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1813,8 +1813,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1895,8 +1894,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpmovsxwq {{.*#+}} ymm1 = [3584,0,3584,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1936,8 +1934,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,0,15,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index bd78dbded0705..b5e41b73a7074 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1043,9 +1043,8 @@ define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_32103210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vpermps 
%ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_32103210: @@ -1091,9 +1090,8 @@ define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_76547654: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76547654: @@ -2854,9 +2852,8 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_32103210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_32103210: @@ -2902,9 +2899,8 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_76547654: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX512VL-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76547654: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index fce98cd470bcd..fe4157d9c4884 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -152,19 +152,11 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { -; AVX512F-LABEL: shuffle_v8f64_08080808: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: shuffle_v8f64_08080808: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0] -; AVX512F-32-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8f64_08080808: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -798,19 +790,11 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { -; AVX512F-LABEL: shuffle_v8i64_08080808: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512F-32-LABEL: 
shuffle_v8i64_08080808: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0] -; AVX512F-32-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-32-NEXT: retl +; ALL-LABEL: shuffle_v8i64_08080808: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index 545a9d3e314a2..c662f3bda67df 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -336,9 +336,7 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){ ; ; AVX512F-LABEL: test_mm512_mask_blend_epi8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512F-NEXT: ret{{[l|q]}} entry: %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> @@ -640,8 +638,7 @@ define <32 x float> @PR47534(<8 x float> %tmp) { ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,0,0,0,7,0,0,0,0,25,26,27,0,29,30,31] ; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: ret{{[l|q]}} %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index ad73bb6886b9f..fcbf12aea5eec 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -140,7 +140,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [64,64] ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; @@ -341,7 +341,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [64,64] ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll index 3c35f7b7fb751..79c194469139c 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -64,7 +64,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] +; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [64,64,64,64] ; AVX512CDVL-NEXT: 
vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
@@ -189,7 +189,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX512CDVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
-; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
+; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [64,64,64,64]
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll
index 56001468898e4..df58a0795e1df 100644
--- a/llvm/test/CodeGen/X86/widen_bitcnt.ll
+++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll
@@ -1096,12 +1096,12 @@ define <4 x i32> @widen_cttz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm3
; AVX512VL-NEXT: vpandn %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm2
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vplzcntd %xmm0, %xmm0
-; AVX512VL-NEXT: vpsubd %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512VPOPCNT-LABEL: widen_cttz_v2i32_v4i32:
@@ -1426,12 +1426,12 @@ define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) {
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm3
; AVX512VL-NEXT: vpandn %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm2
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vplzcntd %xmm0, %xmm0
-; AVX512VL-NEXT: vpsubd %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512VPOPCNT-LABEL: widen_cttz_undef_v2i32_v4i32:
diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll
index c3700189d3d0e..59070865fd671 100644
--- a/llvm/test/CodeGen/X86/widen_fadd.ll
+++ b/llvm/test/CodeGen/X86/widen_fadd.ll
@@ -215,13 +215,13 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -258,8 +258,7 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,2,8,10]
; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll
index 2d9e3f60bee46..fc744be4161c0 100644
--- a/llvm/test/CodeGen/X86/widen_fdiv.ll
+++ b/llvm/test/CodeGen/X86/widen_fdiv.ll
@@ -176,13 +176,13 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vdivps %xmm8, %xmm7, %xmm7
; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -203,8 +203,7 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vdivps %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,2,8,10]
; AVX512VL-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
; AVX512VL-NEXT: vmovups (%rdi), %ymm0
; AVX512VL-NEXT: vdivps (%rsi), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll
index 6c3e0ff5a9bcd..1af37d643615d 100644
--- a/llvm/test/CodeGen/X86/widen_fmul.ll
+++ b/llvm/test/CodeGen/X86/widen_fmul.ll
@@ -215,13 +215,13 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vmulps %xmm7, %xmm8, %xmm7
; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -258,8 +258,7 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,2,8,10]
; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll
index 7405d9b7b1c65..0c77dd9e114c8 100644
--- a/llvm/test/CodeGen/X86/widen_fsub.ll
+++ b/llvm/test/CodeGen/X86/widen_fsub.ll
@@ -215,13 +215,13 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vsubps %xmm8, %xmm7, %xmm7
; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,0,0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,8,10]
+; AVX512F-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -258,8 +258,7 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,0,0,0,2,8,10]
; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index eb463837c3bb8..b90b470d41582 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -6158,9 +6158,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6177,9 +6175,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,1,0,8,0,9,0]
; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0
+; AVX512F-FAST-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6197,9 +6193,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512BW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -6213,9 +6207,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,1,0,8,0,9,0]
; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index ea0e3b3a2b9aa..3815efdee1882 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1478,10 +1478,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
-; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
+; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1493,10 +1491,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -3317,8 +3313,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1]
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -3337,8 +3332,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1]
; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index a3e2fb5321f32..a6c01a095cfbf 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1205,10 +1205,9 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
;
; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem))
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
@@ -1216,10 +1215,9 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
;
; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem))
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
@@ -2656,8 +2654,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -2674,8 +2671,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2